blob: a7d9e1224c18bf80f79b22f9fdc8bab7bd240100 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Endianness switches; defaults to little endian */
50
/* Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is
   defined; little endian is chosen unless WORDS_BIGENDIAN is defined. */
#ifndef WORDS_BIGENDIAN
#  define BYTEORDER_IS_LITTLE_ENDIAN
#else
#  define BYTEORDER_IS_BIG_ENDIAN
#endif
56
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000057/* --- Globals ------------------------------------------------------------
58
59 The globals are initialized by the _PyUnicode_Init() API and should
60 not be used before calling that API.
61
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Type check used by the accessor macros of this file.  Release builds
   only test the type with PyUnicode_Check(); debug builds call
   _PyUnicode_CheckConsistency() with the content check disabled. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020074
/* --- Field accessors ---------------------------------------------------
   Each macro casts op to the structure that declares the field and yields
   the field itself, so the unchecked variants can be used as lvalues. */

/* UTF-8 buffer pointer, unchecked. */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject *)(op))->utf8)

/* UTF-8 buffer pointer of a ready string.  For a compact ASCII object the
   bytes stored directly after the PyASCIIObject header are used. */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op) ?                       \
         ((char *)((PyASCIIObject *)(op) + 1)) :            \
         _PyUnicode_UTF8(op))

/* UTF-8 length field, unchecked. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* UTF-8 length of a ready string.  For a compact ASCII object this is the
   character length stored in the PyASCIIObject header. */
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op) ?                       \
         ((PyASCIIObject *)(op))->length :                  \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked access to the wstr pointer, wstr length, character length,
   state bit-field and cached hash. */
#define _PyUnicode_WSTR(op)        (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)      (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)       (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)        (((PyASCIIObject *)(op))->hash)

/* Kind and length, guarded by the _PyUnicode_CHECK() assertion. */
#define _PyUnicode_KIND(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->length)

/* Character data pointer stored in a PyUnicodeObject, unchecked. */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200109
/* File-local replacement for PyUnicode_READY(): evaluates to 0 when op is
   already ready, otherwise to the result of _PyUnicode_Ready(op).  The
   argument is verified with _PyUnicode_CHECK() first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200116
/* True if the UTF-8 pointer of op points at the character data itself.
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op points at the character data itself.
   Only the macro argument is referenced, so the macro does not depend on
   the name of any variable at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
124
/* True if op owns a separate UTF-8 buffer: op is not compact ASCII, its
   UTF-8 pointer is set, and that pointer is not the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (!PyUnicode_IS_COMPACT_ASCII(op) &&                    \
      _PyUnicode_UTF8(op) &&                                \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
132
/* True if op owns a separate wstr buffer: the wstr pointer is set and
   either op is not ready yet or the pointer is not the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (_PyUnicode_WSTR(op) &&                                \
      (!PyUnicode_IS_READY(op) ||                           \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
140
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer to the buffer where the result
   characters are written; it is converted to "to_type *" as a whole
   expression, so arguments such as "buf + offset" are offset first and
   cast afterwards.

   The copy is unrolled four characters at a time; the remaining 0-3
   characters are handled by the trailing loop.  Each character is
   converted with a plain cast to to_type. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
/* The Unicode string has been modified: reset the cached hash by storing
   -1 in the hash field. */
#define _PyUnicode_DIRTY(op)            \
    do {                                \
        _PyUnicode_HASH(op) = -1;       \
    } while (0)
167
Walter Dörwald16807132007-05-25 13:52:07 +0000168/* This dictionary holds all interned unicode strings. Note that references
169 to strings in this dictionary are *not* counted in the string's ob_refcnt.
170 When the interned string reaches a refcnt of 0 the string deallocation
171 function will delete the reference from this dictionary.
172
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000175*/
176static PyObject *interned;
177
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000178/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200179static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200181/* List of static strings. */
182static _Py_Identifier *static_strings;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* Single character Unicode strings in the Latin-1 range are being
185 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200186static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187
Christian Heimes190d79e2008-01-30 11:58:22 +0000188/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by code point 0x00..0x7F: an entry is 1 when the
   character is flagged as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
218
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200219/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200220static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200221static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200222static void copy_characters(
223 PyObject *to, Py_ssize_t to_start,
224 PyObject *from, Py_ssize_t from_start,
225 Py_ssize_t how_many);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200226
Alexander Belopolsky40018472011-02-26 01:02:56 +0000227static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200228unicode_fromascii(const unsigned char *s, Py_ssize_t size);
229static PyObject *
230_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
231static PyObject *
232_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
235
236static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000237unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000238 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100239 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000240 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
241
Alexander Belopolsky40018472011-02-26 01:02:56 +0000242static void
243raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300244 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100245 PyObject *unicode,
246 Py_ssize_t startpos, Py_ssize_t endpos,
247 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000248
/* Fast detection of line breaks in the ASCII range: lookup table indexed
   by code point 0x00..0x7F, 1 for a line break character and 0 otherwise.
   The table is only ever read, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
276
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300277/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
278 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000279Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000280PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000282#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000283 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 /* This is actually an illegal character, so it should
286 not be passed to unichr. */
287 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000288#endif
289}
290
#ifdef Py_DEBUG
/* Debug-only verification of the internal layout of a unicode object.

   op must be a unicode object.  The state flags (kind, compact, ascii,
   ready) are checked against the data, utf8 and wstr pointers and their
   lengths.  When check_content is non-zero and the string is not of the
   wchar kind, the characters are scanned as well to verify that the
   narrowest possible kind is used.

   Every violated invariant trips an assert(); the function itself always
   returns 1, so the call can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *hdr;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    hdr = (PyASCIIObject *)op;
    kind = hdr->state.kind;

    if (hdr->state.ascii == 1 && hdr->state.compact == 1) {
        /* compact ASCII object */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(hdr->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *ext = (PyCompactUnicodeObject *)op;
        void *chars;
        /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (hdr->state.compact == 1) {
            /* compact, non-ASCII: the characters follow the header */
            chars = ext + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(hdr->state.ascii == 0);
            assert(hdr->state.ready == 1);
            assert(ext->utf8 != chars);
        }
        else {
            PyUnicodeObject *full = (PyUnicodeObject *)op;

            chars = full->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact object that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(hdr->state.compact == 0);
                assert(hdr->state.ready == 1);
                assert(chars != NULL);
                if (hdr->state.ascii) {
                    assert(ext->utf8 == chars);
                    assert(ext->utf8_length == hdr->length);
                }
                else {
                    assert(ext->utf8 != chars);
                }
            }
            else {
                /* not ready yet: only the wstr buffer exists */
                assert(hdr->length == 0);
                assert(hdr->hash == -1);
                assert(hdr->state.compact == 0);
                assert(hdr->state.ascii == 0);
                assert(hdr->state.ready == 0);
                assert(hdr->state.interned == SSTATE_NOT_INTERNED);
                assert(hdr->wstr != NULL);
                assert(chars == NULL);
                assert(ext->utf8 == NULL);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                /* wstr must alias the character data */
                assert(hdr->wstr == chars);
                assert(ext->wstr_length == hdr->length);
            }
            else {
                assert(hdr->wstr != chars);
            }
        }

        if (ext->utf8 == NULL) {
            assert(ext->utf8_length == 0);
        }
        if (hdr->wstr == NULL) {
            assert(ext->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 highest = 0;
        void *payload = PyUnicode_DATA(hdr);

        for (pos = 0; pos < hdr->length; pos++) {
            Py_UCS4 cp = PyUnicode_READ(kind, payload, pos);
            if (cp > highest)
                highest = cp;
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (hdr->state.ascii == 0) {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            else {
                assert(highest < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
        }
        else {
            assert(highest >= 0x10000);
            /* FIXME: Issue #13441: on Solaris, localeconv() and strxfrm()
               return characters outside the range U+0000-U+10FFFF. */
            /* assert(highest <= 0x10FFFF); */
        }
    }
    return 1;
}
#endif
404
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100405static PyObject*
406unicode_result_wchar(PyObject *unicode)
407{
408#ifndef Py_DEBUG
409 Py_ssize_t len;
410
411 assert(Py_REFCNT(unicode) == 1);
412
413 len = _PyUnicode_WSTR_LENGTH(unicode);
414 if (len == 0) {
415 Py_INCREF(unicode_empty);
416 Py_DECREF(unicode);
417 return unicode_empty;
418 }
419
420 if (len == 1) {
421 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
422 if (ch < 256) {
423 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
424 Py_DECREF(unicode);
425 return latin1_char;
426 }
427 }
428
429 if (_PyUnicode_Ready(unicode) < 0) {
430 Py_XDECREF(unicode);
431 return NULL;
432 }
433#else
434 /* don't make the result ready in debug mode to ensure that the caller
435 makes the string ready before using it */
436 assert(_PyUnicode_CheckConsistency(unicode, 1));
437#endif
438 return unicode;
439}
440
441static PyObject*
442unicode_result_ready(PyObject *unicode)
443{
444 Py_ssize_t length;
445
446 length = PyUnicode_GET_LENGTH(unicode);
447 if (length == 0) {
448 if (unicode != unicode_empty) {
449 Py_INCREF(unicode_empty);
450 Py_DECREF(unicode);
451 }
452 return unicode_empty;
453 }
454
455 if (length == 1) {
456 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
457 if (ch < 256) {
458 PyObject *latin1_char = unicode_latin1[ch];
459 if (latin1_char != NULL) {
460 if (unicode != latin1_char) {
461 Py_INCREF(latin1_char);
462 Py_DECREF(unicode);
463 }
464 return latin1_char;
465 }
466 else {
467 assert(_PyUnicode_CheckConsistency(unicode, 1));
468 Py_INCREF(unicode);
469 unicode_latin1[ch] = unicode;
470 return unicode;
471 }
472 }
473 }
474
475 assert(_PyUnicode_CheckConsistency(unicode, 1));
476 return unicode;
477}
478
479static PyObject*
480unicode_result(PyObject *unicode)
481{
482 assert(_PyUnicode_CHECK(unicode));
483 if (PyUnicode_IS_READY(unicode))
484 return unicode_result_ready(unicode);
485 else
486 return unicode_result_wchar(unicode);
487}
488
Victor Stinner3a50e702011-10-18 21:21:00 +0200489#ifdef HAVE_MBCS
490static OSVERSIONINFOEX winver;
491#endif
492
Thomas Wouters477c8d52006-05-27 19:21:47 +0000493/* --- Bloom Filters ----------------------------------------------------- */
494
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low bits of
   each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
498
499/* the linebreak mask is set up by Unicode_Init below */
500
Antoine Pitrouf068f942010-01-13 14:19:12 +0000501#if LONG_BIT >= 128
502#define BLOOM_WIDTH 128
503#elif LONG_BIT >= 64
504#define BLOOM_WIDTH 64
505#elif LONG_BIT >= 32
506#define BLOOM_WIDTH 32
507#else
508#error "LONG_BIT is smaller than 32"
509#endif
510
/* Type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK() for non-ASCII characters. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the bit selected by the low bits of ch
   (ch & (BLOOM_WIDTH - 1)).  mask is parenthesized so that compound
   expressions such as "a | b" are handled as a whole. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* True if ch is a line break: characters below 128 are looked up in
   ascii_linebreak[]; for the others the bloom mask is consulted first and
   Py_UNICODE_ISLINEBREAK() is only called on a possible hit. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000521
Alexander Belopolsky40018472011-02-26 01:02:56 +0000522Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200523make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000524{
525 /* calculate simple bloom-style bitmask for a given unicode string */
526
Antoine Pitrouf068f942010-01-13 14:19:12 +0000527 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000528 Py_ssize_t i;
529
530 mask = 0;
531 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200532 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000533
534 return mask;
535}
536
/* True if chr occurs in str: the bloom mask is tested first, and
   PyUnicode_FindChar() is only called when the mask reports a possible
   hit. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr, 0,                                   \
                         PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000540
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200541/* Compilation of templated routines */
542
543#include "stringlib/asciilib.h"
544#include "stringlib/fastsearch.h"
545#include "stringlib/partition.h"
546#include "stringlib/split.h"
547#include "stringlib/count.h"
548#include "stringlib/find.h"
549#include "stringlib/find_max_char.h"
550#include "stringlib/localeutil.h"
551#include "stringlib/undef.h"
552
553#include "stringlib/ucs1lib.h"
554#include "stringlib/fastsearch.h"
555#include "stringlib/partition.h"
556#include "stringlib/split.h"
557#include "stringlib/count.h"
558#include "stringlib/find.h"
559#include "stringlib/find_max_char.h"
560#include "stringlib/localeutil.h"
561#include "stringlib/undef.h"
562
563#include "stringlib/ucs2lib.h"
564#include "stringlib/fastsearch.h"
565#include "stringlib/partition.h"
566#include "stringlib/split.h"
567#include "stringlib/count.h"
568#include "stringlib/find.h"
569#include "stringlib/find_max_char.h"
570#include "stringlib/localeutil.h"
571#include "stringlib/undef.h"
572
573#include "stringlib/ucs4lib.h"
574#include "stringlib/fastsearch.h"
575#include "stringlib/partition.h"
576#include "stringlib/split.h"
577#include "stringlib/count.h"
578#include "stringlib/find.h"
579#include "stringlib/find_max_char.h"
580#include "stringlib/localeutil.h"
581#include "stringlib/undef.h"
582
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200583#include "stringlib/unicodedefs.h"
584#include "stringlib/fastsearch.h"
585#include "stringlib/count.h"
586#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100587#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200588
Guido van Rossumd57fd912000-03-10 22:53:23 +0000589/* --- Unicode Object ----------------------------------------------------- */
590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200591static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200592fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200593
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200594Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
595 Py_ssize_t size, Py_UCS4 ch,
596 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200597{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200598 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
599
600 switch (kind) {
601 case PyUnicode_1BYTE_KIND:
602 {
603 Py_UCS1 ch1 = (Py_UCS1) ch;
604 if (ch1 == ch)
605 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
606 else
607 return -1;
608 }
609 case PyUnicode_2BYTE_KIND:
610 {
611 Py_UCS2 ch2 = (Py_UCS2) ch;
612 if (ch2 == ch)
613 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
614 else
615 return -1;
616 }
617 case PyUnicode_4BYTE_KIND:
618 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
619 default:
620 assert(0);
621 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200622 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200623}
624
Victor Stinnerfe226c02011-10-03 03:52:20 +0200625static PyObject*
626resize_compact(PyObject *unicode, Py_ssize_t length)
627{
628 Py_ssize_t char_size;
629 Py_ssize_t struct_size;
630 Py_ssize_t new_size;
631 int share_wstr;
632
633 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200634 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200635 if (PyUnicode_IS_COMPACT_ASCII(unicode))
636 struct_size = sizeof(PyASCIIObject);
637 else
638 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200639 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200640
641 _Py_DEC_REFTOTAL;
642 _Py_ForgetReference(unicode);
643
644 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
645 PyErr_NoMemory();
646 return NULL;
647 }
648 new_size = (struct_size + (length + 1) * char_size);
649
650 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
651 if (unicode == NULL) {
652 PyObject_Del(unicode);
653 PyErr_NoMemory();
654 return NULL;
655 }
656 _Py_NewReference(unicode);
657 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200658 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200659 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200660 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
661 _PyUnicode_WSTR_LENGTH(unicode) = length;
662 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200663 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
664 length, 0);
665 return unicode;
666}
667
Alexander Belopolsky40018472011-02-26 01:02:56 +0000668static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200669resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000670{
Victor Stinner95663112011-10-04 01:03:50 +0200671 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200672 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200673 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000674
Victor Stinner95663112011-10-04 01:03:50 +0200675 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676
677 if (PyUnicode_IS_READY(unicode)) {
678 Py_ssize_t char_size;
679 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200680 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681 void *data;
682
683 data = _PyUnicode_DATA_ANY(unicode);
684 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200685 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200686 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
687 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200688 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
689 {
690 PyObject_DEL(_PyUnicode_UTF8(unicode));
691 _PyUnicode_UTF8(unicode) = NULL;
692 _PyUnicode_UTF8_LENGTH(unicode) = 0;
693 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200694
695 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
696 PyErr_NoMemory();
697 return -1;
698 }
699 new_size = (length + 1) * char_size;
700
701 data = (PyObject *)PyObject_REALLOC(data, new_size);
702 if (data == NULL) {
703 PyErr_NoMemory();
704 return -1;
705 }
706 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200707 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200708 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200709 _PyUnicode_WSTR_LENGTH(unicode) = length;
710 }
711 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200712 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200713 _PyUnicode_UTF8_LENGTH(unicode) = length;
714 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200715 _PyUnicode_LENGTH(unicode) = length;
716 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200717 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200718 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200719 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200720 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 }
Victor Stinner95663112011-10-04 01:03:50 +0200722 assert(_PyUnicode_WSTR(unicode) != NULL);
723
724 /* check for integer overflow */
725 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
726 PyErr_NoMemory();
727 return -1;
728 }
729 wstr = _PyUnicode_WSTR(unicode);
730 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
731 if (!wstr) {
732 PyErr_NoMemory();
733 return -1;
734 }
735 _PyUnicode_WSTR(unicode) = wstr;
736 _PyUnicode_WSTR(unicode)[length] = 0;
737 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200738 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000739 return 0;
740}
741
Victor Stinnerfe226c02011-10-03 03:52:20 +0200742static PyObject*
743resize_copy(PyObject *unicode, Py_ssize_t length)
744{
745 Py_ssize_t copy_length;
746 if (PyUnicode_IS_COMPACT(unicode)) {
747 PyObject *copy;
748 assert(PyUnicode_IS_READY(unicode));
749
750 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
751 if (copy == NULL)
752 return NULL;
753
754 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200755 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200756 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200757 }
758 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200759 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760 assert(_PyUnicode_WSTR(unicode) != NULL);
761 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200762 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200763 if (w == NULL)
764 return NULL;
765 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
766 copy_length = Py_MIN(copy_length, length);
767 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
768 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200769 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200770 }
771}
772
/* We allocate one more character than requested to make sure the string
   is U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200782#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200783static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200784#endif
785
Alexander Belopolsky40018472011-02-26 01:02:56 +0000786static PyUnicodeObject *
787_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000788{
789 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200790 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000791
Thomas Wouters477c8d52006-05-27 19:21:47 +0000792 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000793 if (length == 0 && unicode_empty != NULL) {
794 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200795 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000796 }
797
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000798 /* Ensure we won't overflow the size. */
799 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
800 return (PyUnicodeObject *)PyErr_NoMemory();
801 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200802 if (length < 0) {
803 PyErr_SetString(PyExc_SystemError,
804 "Negative size passed to _PyUnicode_New");
805 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000806 }
807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200808#ifdef Py_DEBUG
809 ++unicode_old_new_calls;
810#endif
811
812 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
813 if (unicode == NULL)
814 return NULL;
815 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
816 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
817 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000818 PyErr_NoMemory();
819 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821
Jeremy Hyltond8082792003-09-16 19:41:39 +0000822 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000823 * the caller fails before initializing str -- unicode_resize()
824 * reads str[0], and the Keep-Alive optimization can keep memory
825 * allocated for str alive across a call to unicode_dealloc(unicode).
826 * We don't want unicode_resize to read uninitialized memory in
827 * that case.
828 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200829 _PyUnicode_WSTR(unicode)[0] = 0;
830 _PyUnicode_WSTR(unicode)[length] = 0;
831 _PyUnicode_WSTR_LENGTH(unicode) = length;
832 _PyUnicode_HASH(unicode) = -1;
833 _PyUnicode_STATE(unicode).interned = 0;
834 _PyUnicode_STATE(unicode).kind = 0;
835 _PyUnicode_STATE(unicode).compact = 0;
836 _PyUnicode_STATE(unicode).ready = 0;
837 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200838 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200839 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200840 _PyUnicode_UTF8(unicode) = NULL;
841 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100842 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000843 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000844
Benjamin Peterson29060642009-01-31 22:14:21 +0000845 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000846 /* XXX UNREF/NEWREF interface should be more symmetrical */
847 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000848 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000849 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000850 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000851}
852
Victor Stinnerf42dc442011-10-02 23:33:16 +0200853static const char*
854unicode_kind_name(PyObject *unicode)
855{
Victor Stinner42dfd712011-10-03 14:41:45 +0200856 /* don't check consistency: unicode_kind_name() is called from
857 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200858 if (!PyUnicode_IS_COMPACT(unicode))
859 {
860 if (!PyUnicode_IS_READY(unicode))
861 return "wstr";
862 switch(PyUnicode_KIND(unicode))
863 {
864 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200865 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200866 return "legacy ascii";
867 else
868 return "legacy latin1";
869 case PyUnicode_2BYTE_KIND:
870 return "legacy UCS2";
871 case PyUnicode_4BYTE_KIND:
872 return "legacy UCS4";
873 default:
874 return "<legacy invalid kind>";
875 }
876 }
877 assert(PyUnicode_IS_READY(unicode));
878 switch(PyUnicode_KIND(unicode))
879 {
880 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200881 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200882 return "ascii";
883 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200884 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200885 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200886 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200887 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200888 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200889 default:
890 return "<invalid compact kind>";
891 }
892}
893
#ifdef Py_DEBUG
/* Number of calls to PyUnicode_New() that went past the empty-string
   shortcut. */
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the layout flags and candidate data addresses of `unicode` to
   stdout, then return PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line description of the string `op` to stdout: kind name,
   length, the wstr pointer (flagged "shared" when it equals the data
   pointer), and, unless the string is compact ASCII, the wstr length and
   the utf8 pointer/length, followed by the data pointer.

   Lengths are Py_ssize_t and are printed with a signed conversion;
   pointers are converted to void * as "%p" requires. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* Compact strings store their characters right after the header,
       whose size depends on the ascii flag; others use a pointer. */
    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    printf("%s: len=%zd, ", unicode_kind_name(op), ascii->length);

    if ((void *)ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", (void *)ascii->wstr);

    /* The wstr_length and utf8 fields are only read when the string is
       not compact ASCII. */
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", compact->wstr_length);
        if (!ascii->state.compact
            && (void *)compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
947
948PyObject *
949PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
950{
951 PyObject *obj;
952 PyCompactUnicodeObject *unicode;
953 void *data;
954 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200955 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200956 Py_ssize_t char_size;
957 Py_ssize_t struct_size;
958
959 /* Optimization for empty strings */
960 if (size == 0 && unicode_empty != NULL) {
961 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200962 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963 }
964
965#ifdef Py_DEBUG
966 ++unicode_new_new_calls;
967#endif
968
Victor Stinner9e9d6892011-10-04 01:02:02 +0200969 is_ascii = 0;
970 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200971 struct_size = sizeof(PyCompactUnicodeObject);
972 if (maxchar < 128) {
973 kind_state = PyUnicode_1BYTE_KIND;
974 char_size = 1;
975 is_ascii = 1;
976 struct_size = sizeof(PyASCIIObject);
977 }
978 else if (maxchar < 256) {
979 kind_state = PyUnicode_1BYTE_KIND;
980 char_size = 1;
981 }
982 else if (maxchar < 65536) {
983 kind_state = PyUnicode_2BYTE_KIND;
984 char_size = 2;
985 if (sizeof(wchar_t) == 2)
986 is_sharing = 1;
987 }
988 else {
989 kind_state = PyUnicode_4BYTE_KIND;
990 char_size = 4;
991 if (sizeof(wchar_t) == 4)
992 is_sharing = 1;
993 }
994
995 /* Ensure we won't overflow the size. */
996 if (size < 0) {
997 PyErr_SetString(PyExc_SystemError,
998 "Negative size passed to PyUnicode_New");
999 return NULL;
1000 }
1001 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1002 return PyErr_NoMemory();
1003
1004 /* Duplicated allocation code from _PyObject_New() instead of a call to
1005 * PyObject_New() so we are able to allocate space for the object and
1006 * it's data buffer.
1007 */
1008 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1009 if (obj == NULL)
1010 return PyErr_NoMemory();
1011 obj = PyObject_INIT(obj, &PyUnicode_Type);
1012 if (obj == NULL)
1013 return NULL;
1014
1015 unicode = (PyCompactUnicodeObject *)obj;
1016 if (is_ascii)
1017 data = ((PyASCIIObject*)obj) + 1;
1018 else
1019 data = unicode + 1;
1020 _PyUnicode_LENGTH(unicode) = size;
1021 _PyUnicode_HASH(unicode) = -1;
1022 _PyUnicode_STATE(unicode).interned = 0;
1023 _PyUnicode_STATE(unicode).kind = kind_state;
1024 _PyUnicode_STATE(unicode).compact = 1;
1025 _PyUnicode_STATE(unicode).ready = 1;
1026 _PyUnicode_STATE(unicode).ascii = is_ascii;
1027 if (is_ascii) {
1028 ((char*)data)[size] = 0;
1029 _PyUnicode_WSTR(unicode) = NULL;
1030 }
1031 else if (kind_state == PyUnicode_1BYTE_KIND) {
1032 ((char*)data)[size] = 0;
1033 _PyUnicode_WSTR(unicode) = NULL;
1034 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001035 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001036 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001037 }
1038 else {
1039 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001040 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001041 if (kind_state == PyUnicode_2BYTE_KIND)
1042 ((Py_UCS2*)data)[size] = 0;
1043 else /* kind_state == PyUnicode_4BYTE_KIND */
1044 ((Py_UCS4*)data)[size] = 0;
1045 if (is_sharing) {
1046 _PyUnicode_WSTR_LENGTH(unicode) = size;
1047 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1048 }
1049 else {
1050 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1051 _PyUnicode_WSTR(unicode) = NULL;
1052 }
1053 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001054 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001055 return obj;
1056}
1057
#if SIZEOF_WCHAR_T == 2
/* Decode the 16-bit wchar_t range [begin, end) into the 4-byte data buffer
   of `unicode`.  A high surrogate (0xD800-0xDBFF) immediately followed by
   a low surrogate (0xDC00-0xDFFF) is combined into one code point; every
   other unit, including an unpaired surrogate, is copied unchanged.  The
   other conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src;
            src++;
            continue;
        }

        /* 10 payload bits from each half, offset past the BMP. */
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }

    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1096
Victor Stinnercd9950f2011-10-02 00:34:53 +02001097static int
1098_PyUnicode_Dirty(PyObject *unicode)
1099{
Victor Stinner910337b2011-10-03 03:20:16 +02001100 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001101 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001102 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001103 "Cannot modify a string having more than 1 reference");
1104 return -1;
1105 }
1106 _PyUnicode_DIRTY(unicode);
1107 return 0;
1108}
1109
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001110static int
1111_copy_characters(PyObject *to, Py_ssize_t to_start,
1112 PyObject *from, Py_ssize_t from_start,
1113 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001114{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001115 unsigned int from_kind, to_kind;
1116 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001117 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001119 assert(PyUnicode_Check(from));
1120 assert(PyUnicode_Check(to));
1121 assert(PyUnicode_IS_READY(from));
1122 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001124 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1125 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1126 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001128 if (how_many == 0)
1129 return 0;
1130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001131 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001132 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001133 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001134 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001135
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001136#ifdef Py_DEBUG
1137 if (!check_maxchar
1138 && (from_kind > to_kind
1139 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001140 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001141 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1142 Py_UCS4 ch;
1143 Py_ssize_t i;
1144 for (i=0; i < how_many; i++) {
1145 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1146 assert(ch <= to_maxchar);
1147 }
1148 }
1149#endif
1150 fast = (from_kind == to_kind);
1151 if (check_maxchar
1152 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1153 {
1154 /* deny latin1 => ascii */
1155 fast = 0;
1156 }
1157
1158 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001159 Py_MEMCPY((char*)to_data + to_kind * to_start,
1160 (char*)from_data + from_kind * from_start,
1161 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001163 else if (from_kind == PyUnicode_1BYTE_KIND
1164 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001165 {
1166 _PyUnicode_CONVERT_BYTES(
1167 Py_UCS1, Py_UCS2,
1168 PyUnicode_1BYTE_DATA(from) + from_start,
1169 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1170 PyUnicode_2BYTE_DATA(to) + to_start
1171 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001172 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001173 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001174 && to_kind == PyUnicode_4BYTE_KIND)
1175 {
1176 _PyUnicode_CONVERT_BYTES(
1177 Py_UCS1, Py_UCS4,
1178 PyUnicode_1BYTE_DATA(from) + from_start,
1179 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1180 PyUnicode_4BYTE_DATA(to) + to_start
1181 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001182 }
1183 else if (from_kind == PyUnicode_2BYTE_KIND
1184 && to_kind == PyUnicode_4BYTE_KIND)
1185 {
1186 _PyUnicode_CONVERT_BYTES(
1187 Py_UCS2, Py_UCS4,
1188 PyUnicode_2BYTE_DATA(from) + from_start,
1189 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1190 PyUnicode_4BYTE_DATA(to) + to_start
1191 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001192 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001193 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001194 /* check if max_char(from substring) <= max_char(to) */
1195 if (from_kind > to_kind
1196 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001197 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001198 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001199 /* slow path to check for character overflow */
1200 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001201 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001202 Py_ssize_t i;
1203
Victor Stinner56c161a2011-10-06 02:47:11 +02001204#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001205 for (i=0; i < how_many; i++) {
1206 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001207 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001208 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1209 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001210#else
1211 if (!check_maxchar) {
1212 for (i=0; i < how_many; i++) {
1213 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1214 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1215 }
1216 }
1217 else {
1218 for (i=0; i < how_many; i++) {
1219 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1220 if (ch > to_maxchar)
1221 return 1;
1222 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1223 }
1224 }
1225#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001226 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001227 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001228 assert(0 && "inconsistent state");
1229 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001230 }
1231 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001232 return 0;
1233}
1234
1235static void
1236copy_characters(PyObject *to, Py_ssize_t to_start,
1237 PyObject *from, Py_ssize_t from_start,
1238 Py_ssize_t how_many)
1239{
1240 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1241}
1242
1243Py_ssize_t
1244PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1245 PyObject *from, Py_ssize_t from_start,
1246 Py_ssize_t how_many)
1247{
1248 int err;
1249
1250 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1251 PyErr_BadInternalCall();
1252 return -1;
1253 }
1254
1255 if (PyUnicode_READY(from))
1256 return -1;
1257 if (PyUnicode_READY(to))
1258 return -1;
1259
1260 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1261 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1262 PyErr_Format(PyExc_SystemError,
1263 "Cannot write %zi characters at %zi "
1264 "in a string of %zi characters",
1265 how_many, to_start, PyUnicode_GET_LENGTH(to));
1266 return -1;
1267 }
1268
1269 if (how_many == 0)
1270 return 0;
1271
1272 if (_PyUnicode_Dirty(to))
1273 return -1;
1274
1275 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1276 if (err) {
1277 PyErr_Format(PyExc_SystemError,
1278 "Cannot copy %s characters "
1279 "into a string of %s characters",
1280 unicode_kind_name(from),
1281 unicode_kind_name(to));
1282 return -1;
1283 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001284 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001285}
1286
Victor Stinner17222162011-09-28 22:15:37 +02001287/* Find the maximum code point and count the number of surrogate pairs so a
1288 correct string length can be computed before converting a string to UCS4.
1289 This function counts single surrogates as a character and not as a pair.
1290
1291 Return 0 on success, or -1 on error. */
1292static int
1293find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1294 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001295{
1296 const wchar_t *iter;
1297
Victor Stinnerc53be962011-10-02 21:33:54 +02001298 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001299 *num_surrogates = 0;
1300 *maxchar = 0;
1301
1302 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001303 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001305#if SIZEOF_WCHAR_T != 2
1306 if (*maxchar >= 0x10000)
1307 return 0;
1308#endif
1309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001311 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1312 && (iter+1) < end
1313 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314 {
1315 Py_UCS4 surrogate_val;
Victor Stinnerca4f2072011-11-22 03:38:40 +01001316 surrogate_val = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 ++(*num_surrogates);
1318 if (surrogate_val > *maxchar)
1319 *maxchar = surrogate_val;
1320 iter += 2;
1321 }
1322 else
1323 iter++;
1324#else
1325 iter++;
1326#endif
1327 }
1328 return 0;
1329}
1330
1331#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001332static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333#endif
1334
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001335int
1336_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337{
1338 wchar_t *end;
1339 Py_UCS4 maxchar = 0;
1340 Py_ssize_t num_surrogates;
1341#if SIZEOF_WCHAR_T == 2
1342 Py_ssize_t length_wo_surrogates;
1343#endif
1344
Georg Brandl7597add2011-10-05 16:36:47 +02001345 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001346 strings were created using _PyObject_New() and where no canonical
1347 representation (the str field) has been set yet aka strings
1348 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001349 assert(_PyUnicode_CHECK(unicode));
1350 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001351 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001352 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001353 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001354 /* Actually, it should neither be interned nor be anything else: */
1355 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001356
1357#ifdef Py_DEBUG
1358 ++unicode_ready_calls;
1359#endif
1360
1361 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001362 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001363 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001364 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001365
1366 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001367 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1368 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001369 PyErr_NoMemory();
1370 return -1;
1371 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001372 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373 _PyUnicode_WSTR(unicode), end,
1374 PyUnicode_1BYTE_DATA(unicode));
1375 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1376 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1377 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1378 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001379 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001380 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001381 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382 }
1383 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001384 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001385 _PyUnicode_UTF8(unicode) = NULL;
1386 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387 }
1388 PyObject_FREE(_PyUnicode_WSTR(unicode));
1389 _PyUnicode_WSTR(unicode) = NULL;
1390 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1391 }
1392 /* In this case we might have to convert down from 4-byte native
1393 wchar_t to 2-byte unicode. */
1394 else if (maxchar < 65536) {
1395 assert(num_surrogates == 0 &&
1396 "FindMaxCharAndNumSurrogatePairs() messed up");
1397
Victor Stinner506f5922011-09-28 22:34:18 +02001398#if SIZEOF_WCHAR_T == 2
1399 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001400 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001401 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1402 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1403 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001404 _PyUnicode_UTF8(unicode) = NULL;
1405 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001406#else
1407 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001408 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001409 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001410 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001411 PyErr_NoMemory();
1412 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 }
Victor Stinner506f5922011-09-28 22:34:18 +02001414 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1415 _PyUnicode_WSTR(unicode), end,
1416 PyUnicode_2BYTE_DATA(unicode));
1417 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1418 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1419 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001420 _PyUnicode_UTF8(unicode) = NULL;
1421 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001422 PyObject_FREE(_PyUnicode_WSTR(unicode));
1423 _PyUnicode_WSTR(unicode) = NULL;
1424 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1425#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426 }
1427 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1428 else {
1429#if SIZEOF_WCHAR_T == 2
1430 /* in case the native representation is 2-bytes, we need to allocate a
1431 new normalized 4-byte version. */
1432 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001433 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1434 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001435 PyErr_NoMemory();
1436 return -1;
1437 }
1438 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1439 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001440 _PyUnicode_UTF8(unicode) = NULL;
1441 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001442 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1443 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001444 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445 PyObject_FREE(_PyUnicode_WSTR(unicode));
1446 _PyUnicode_WSTR(unicode) = NULL;
1447 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1448#else
1449 assert(num_surrogates == 0);
1450
Victor Stinnerc3c74152011-10-02 20:39:55 +02001451 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001453 _PyUnicode_UTF8(unicode) = NULL;
1454 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001455 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1456#endif
1457 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1458 }
1459 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001460 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001461 return 0;
1462}
1463
Alexander Belopolsky40018472011-02-26 01:02:56 +00001464static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001465unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001466{
Walter Dörwald16807132007-05-25 13:52:07 +00001467 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001468 case SSTATE_NOT_INTERNED:
1469 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001470
Benjamin Peterson29060642009-01-31 22:14:21 +00001471 case SSTATE_INTERNED_MORTAL:
1472 /* revive dead object temporarily for DelItem */
1473 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001474 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001475 Py_FatalError(
1476 "deletion of interned string failed");
1477 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001478
Benjamin Peterson29060642009-01-31 22:14:21 +00001479 case SSTATE_INTERNED_IMMORTAL:
1480 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001481
Benjamin Peterson29060642009-01-31 22:14:21 +00001482 default:
1483 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001484 }
1485
Victor Stinner03490912011-10-03 23:45:12 +02001486 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001488 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001489 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490
1491 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001492 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001493 }
1494 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001495 if (_PyUnicode_DATA_ANY(unicode))
1496 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001497 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498 }
1499}
1500
#ifdef Py_DEBUG
/* Debug-only helper: return 1 when `unicode` is the shared empty string or
   the cached one-character string stored in unicode_latin1[], 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;
    /* Only a one-character string that is not of the WCHAR kind can be a
       Latin-1 cache entry. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;
    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1517
Alexander Belopolsky40018472011-02-26 01:02:56 +00001518static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001519unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001520{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001521 if (Py_REFCNT(unicode) != 1)
1522 return 0;
1523 if (PyUnicode_CHECK_INTERNED(unicode))
1524 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001525#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001526 /* singleton refcount is greater than 1 */
1527 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001528#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001529 return 1;
1530}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001531
Victor Stinnerfe226c02011-10-03 03:52:20 +02001532static int
1533unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1534{
1535 PyObject *unicode;
1536 Py_ssize_t old_length;
1537
1538 assert(p_unicode != NULL);
1539 unicode = *p_unicode;
1540
1541 assert(unicode != NULL);
1542 assert(PyUnicode_Check(unicode));
1543 assert(0 <= length);
1544
Victor Stinner910337b2011-10-03 03:20:16 +02001545 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001546 old_length = PyUnicode_WSTR_LENGTH(unicode);
1547 else
1548 old_length = PyUnicode_GET_LENGTH(unicode);
1549 if (old_length == length)
1550 return 0;
1551
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001552 if (length == 0) {
1553 Py_DECREF(*p_unicode);
1554 *p_unicode = unicode_empty;
1555 Py_INCREF(*p_unicode);
1556 return 0;
1557 }
1558
Victor Stinnerfe226c02011-10-03 03:52:20 +02001559 if (!unicode_resizable(unicode)) {
1560 PyObject *copy = resize_copy(unicode, length);
1561 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001562 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001563 Py_DECREF(*p_unicode);
1564 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001565 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001566 }
1567
Victor Stinnerfe226c02011-10-03 03:52:20 +02001568 if (PyUnicode_IS_COMPACT(unicode)) {
1569 *p_unicode = resize_compact(unicode, length);
1570 if (*p_unicode == NULL)
1571 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001572 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001573 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001574 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001575 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001576}
1577
Alexander Belopolsky40018472011-02-26 01:02:56 +00001578int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001579PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001580{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001581 PyObject *unicode;
1582 if (p_unicode == NULL) {
1583 PyErr_BadInternalCall();
1584 return -1;
1585 }
1586 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001587 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001588 {
1589 PyErr_BadInternalCall();
1590 return -1;
1591 }
1592 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001593}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001594
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001595static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001596unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001597{
1598 PyObject *result;
1599 assert(PyUnicode_IS_READY(*p_unicode));
1600 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1601 return 0;
1602 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1603 maxchar);
1604 if (result == NULL)
1605 return -1;
1606 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1607 PyUnicode_GET_LENGTH(*p_unicode));
1608 Py_DECREF(*p_unicode);
1609 *p_unicode = result;
1610 return 0;
1611}
1612
1613static int
1614unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1615 Py_UCS4 ch)
1616{
1617 if (unicode_widen(p_unicode, ch) < 0)
1618 return -1;
1619 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1620 PyUnicode_DATA(*p_unicode),
1621 (*pos)++, ch);
1622 return 0;
1623}
1624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001625static PyObject*
1626get_latin1_char(unsigned char ch)
1627{
Victor Stinnera464fc12011-10-02 20:39:30 +02001628 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001629 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001630 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631 if (!unicode)
1632 return NULL;
1633 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001634 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001635 unicode_latin1[ch] = unicode;
1636 }
1637 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001638 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001639}
1640
Alexander Belopolsky40018472011-02-26 01:02:56 +00001641PyObject *
1642PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001643{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001644 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001645 Py_UCS4 maxchar = 0;
1646 Py_ssize_t num_surrogates;
1647
1648 if (u == NULL)
1649 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001650
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001651 /* If the Unicode data is known at construction time, we can apply
1652 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001654 /* Optimization for empty strings */
1655 if (size == 0 && unicode_empty != NULL) {
1656 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001657 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001658 }
Tim Petersced69f82003-09-16 20:30:58 +00001659
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001660 /* Single character Unicode objects in the Latin-1 range are
1661 shared when using this constructor */
1662 if (size == 1 && *u < 256)
1663 return get_latin1_char((unsigned char)*u);
1664
1665 /* If not empty and not single character, copy the Unicode data
1666 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001667 if (find_maxchar_surrogates(u, u + size,
1668 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001669 return NULL;
1670
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001671 unicode = PyUnicode_New(size - num_surrogates,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 if (!unicode)
1674 return NULL;
1675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676 switch (PyUnicode_KIND(unicode)) {
1677 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001678 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1680 break;
1681 case PyUnicode_2BYTE_KIND:
1682#if Py_UNICODE_SIZE == 2
1683 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1684#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001685 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001686 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1687#endif
1688 break;
1689 case PyUnicode_4BYTE_KIND:
1690#if SIZEOF_WCHAR_T == 2
1691 /* This is the only case which has to process surrogates, thus
1692 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001693 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694#else
1695 assert(num_surrogates == 0);
1696 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1697#endif
1698 break;
1699 default:
1700 assert(0 && "Impossible state");
1701 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001702
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001703 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001704}
1705
Alexander Belopolsky40018472011-02-26 01:02:56 +00001706PyObject *
1707PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001708{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001709 if (size < 0) {
1710 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001711 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001712 return NULL;
1713 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001714
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001715 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001716 some optimizations which share commonly used objects.
1717 Also, this means the input must be UTF-8, so fall back to the
1718 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001719 if (u != NULL) {
1720
Benjamin Peterson29060642009-01-31 22:14:21 +00001721 /* Optimization for empty strings */
1722 if (size == 0 && unicode_empty != NULL) {
1723 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001724 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001725 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001726
1727 /* Single characters are shared when using this constructor.
1728 Restrict to ASCII, since the input must be UTF-8. */
Victor Stinner9faa3842011-10-23 20:06:00 +02001729 if (size == 1 && (unsigned char)*u < 128)
1730 return get_latin1_char((unsigned char)*u);
Martin v. Löwis9c121062007-08-05 20:26:11 +00001731
1732 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001733 }
1734
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001735 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001736}
1737
Alexander Belopolsky40018472011-02-26 01:02:56 +00001738PyObject *
1739PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001740{
1741 size_t size = strlen(u);
1742 if (size > PY_SSIZE_T_MAX) {
1743 PyErr_SetString(PyExc_OverflowError, "input too long");
1744 return NULL;
1745 }
1746
1747 return PyUnicode_FromStringAndSize(u, size);
1748}
1749
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001750PyObject *
1751_PyUnicode_FromId(_Py_Identifier *id)
1752{
1753 if (!id->object) {
1754 id->object = PyUnicode_FromString(id->string);
1755 if (!id->object)
1756 return NULL;
1757 PyUnicode_InternInPlace(&id->object);
1758 assert(!id->next);
1759 id->next = static_strings;
1760 static_strings = id;
1761 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001762 return id->object;
1763}
1764
1765void
1766_PyUnicode_ClearStaticStrings()
1767{
1768 _Py_Identifier *i;
1769 for (i = static_strings; i; i = i->next) {
1770 Py_DECREF(i->object);
1771 i->object = NULL;
1772 i->next = NULL;
1773 }
1774}
1775
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001776/* Internal function, don't check maximum character */
1777
Victor Stinnere57b1c02011-09-28 22:20:48 +02001778static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001779unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001780{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001781 PyObject *res;
1782#ifdef Py_DEBUG
1783 const unsigned char *p;
1784 const unsigned char *end = s + size;
1785 for (p=s; p < end; p++) {
1786 assert(*p < 128);
1787 }
1788#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001789 if (size == 1)
1790 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001791 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001792 if (!res)
1793 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001794 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001795 return res;
1796}
1797
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001798static Py_UCS4
1799kind_maxchar_limit(unsigned int kind)
1800{
1801 switch(kind) {
1802 case PyUnicode_1BYTE_KIND:
1803 return 0x80;
1804 case PyUnicode_2BYTE_KIND:
1805 return 0x100;
1806 case PyUnicode_4BYTE_KIND:
1807 return 0x10000;
1808 default:
1809 assert(0 && "invalid kind");
1810 return 0x10ffff;
1811 }
1812}
1813
Victor Stinner702c7342011-10-05 13:50:52 +02001814static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001815_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001816{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001818 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001819
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001820 if (size == 0) {
1821 Py_INCREF(unicode_empty);
1822 return unicode_empty;
1823 }
1824 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001825 if (size == 1)
1826 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001827
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001828 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001829 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830 if (!res)
1831 return NULL;
1832 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001833 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001835}
1836
Victor Stinnere57b1c02011-09-28 22:20:48 +02001837static PyObject*
1838_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839{
1840 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001841 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001842
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001843 if (size == 0) {
1844 Py_INCREF(unicode_empty);
1845 return unicode_empty;
1846 }
1847 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001848 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001849 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001850
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001851 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001852 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001853 if (!res)
1854 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001855 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001856 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001857 else {
1858 _PyUnicode_CONVERT_BYTES(
1859 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1860 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001861 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001862 return res;
1863}
1864
Victor Stinnere57b1c02011-09-28 22:20:48 +02001865static PyObject*
1866_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001867{
1868 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001869 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001870
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001871 if (size == 0) {
1872 Py_INCREF(unicode_empty);
1873 return unicode_empty;
1874 }
1875 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001876 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001877 return get_latin1_char((unsigned char)u[0]);
1878
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001879 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001880 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001881 if (!res)
1882 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001883 if (max_char < 256)
1884 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1885 PyUnicode_1BYTE_DATA(res));
1886 else if (max_char < 0x10000)
1887 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1888 PyUnicode_2BYTE_DATA(res));
1889 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001890 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001891 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 return res;
1893}
1894
1895PyObject*
1896PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1897{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001898 if (size < 0) {
1899 PyErr_SetString(PyExc_ValueError, "size must be positive");
1900 return NULL;
1901 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001902 switch(kind) {
1903 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001904 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001905 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001906 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001908 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001909 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001910 PyErr_SetString(PyExc_SystemError, "invalid kind");
1911 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913}
1914
Victor Stinner25a4b292011-10-06 12:31:55 +02001915/* Ensure that a string uses the most efficient storage, if it is not the
1916 case: create a new string with of the right kind. Write NULL into *p_unicode
1917 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001918static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001919unicode_adjust_maxchar(PyObject **p_unicode)
1920{
1921 PyObject *unicode, *copy;
1922 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001923 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001924 unsigned int kind;
1925
1926 assert(p_unicode != NULL);
1927 unicode = *p_unicode;
1928 assert(PyUnicode_IS_READY(unicode));
1929 if (PyUnicode_IS_ASCII(unicode))
1930 return;
1931
1932 len = PyUnicode_GET_LENGTH(unicode);
1933 kind = PyUnicode_KIND(unicode);
1934 if (kind == PyUnicode_1BYTE_KIND) {
1935 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001936 max_char = ucs1lib_find_max_char(u, u + len);
1937 if (max_char >= 128)
1938 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001939 }
1940 else if (kind == PyUnicode_2BYTE_KIND) {
1941 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001942 max_char = ucs2lib_find_max_char(u, u + len);
1943 if (max_char >= 256)
1944 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001945 }
1946 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001947 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001948 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001949 max_char = ucs4lib_find_max_char(u, u + len);
1950 if (max_char >= 0x10000)
1951 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001952 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001953 copy = PyUnicode_New(len, max_char);
1954 copy_characters(copy, 0, unicode, 0, len);
1955 Py_DECREF(unicode);
1956 *p_unicode = copy;
1957}
1958
Victor Stinner034f6cf2011-09-30 02:26:44 +02001959PyObject*
1960PyUnicode_Copy(PyObject *unicode)
1961{
Victor Stinner87af4f22011-11-21 23:03:47 +01001962 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001963 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001964
Victor Stinner034f6cf2011-09-30 02:26:44 +02001965 if (!PyUnicode_Check(unicode)) {
1966 PyErr_BadInternalCall();
1967 return NULL;
1968 }
1969 if (PyUnicode_READY(unicode))
1970 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001971
Victor Stinner87af4f22011-11-21 23:03:47 +01001972 length = PyUnicode_GET_LENGTH(unicode);
1973 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001974 if (!copy)
1975 return NULL;
1976 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1977
Victor Stinner87af4f22011-11-21 23:03:47 +01001978 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1979 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001980 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001981 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001982}
1983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001984
Victor Stinnerbc603d12011-10-02 01:00:40 +02001985/* Widen Unicode objects to larger buffers. Don't write terminating null
1986 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001987
1988void*
1989_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1990{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001991 Py_ssize_t len;
1992 void *result;
1993 unsigned int skind;
1994
1995 if (PyUnicode_READY(s))
1996 return NULL;
1997
1998 len = PyUnicode_GET_LENGTH(s);
1999 skind = PyUnicode_KIND(s);
2000 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002001 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 return NULL;
2003 }
2004 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002005 case PyUnicode_2BYTE_KIND:
2006 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2007 if (!result)
2008 return PyErr_NoMemory();
2009 assert(skind == PyUnicode_1BYTE_KIND);
2010 _PyUnicode_CONVERT_BYTES(
2011 Py_UCS1, Py_UCS2,
2012 PyUnicode_1BYTE_DATA(s),
2013 PyUnicode_1BYTE_DATA(s) + len,
2014 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002015 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002016 case PyUnicode_4BYTE_KIND:
2017 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2018 if (!result)
2019 return PyErr_NoMemory();
2020 if (skind == PyUnicode_2BYTE_KIND) {
2021 _PyUnicode_CONVERT_BYTES(
2022 Py_UCS2, Py_UCS4,
2023 PyUnicode_2BYTE_DATA(s),
2024 PyUnicode_2BYTE_DATA(s) + len,
2025 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002027 else {
2028 assert(skind == PyUnicode_1BYTE_KIND);
2029 _PyUnicode_CONVERT_BYTES(
2030 Py_UCS1, Py_UCS4,
2031 PyUnicode_1BYTE_DATA(s),
2032 PyUnicode_1BYTE_DATA(s) + len,
2033 result);
2034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002036 default:
2037 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002038 }
Victor Stinner01698042011-10-04 00:04:26 +02002039 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002040 return NULL;
2041}
2042
2043static Py_UCS4*
2044as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2045 int copy_null)
2046{
2047 int kind;
2048 void *data;
2049 Py_ssize_t len, targetlen;
2050 if (PyUnicode_READY(string) == -1)
2051 return NULL;
2052 kind = PyUnicode_KIND(string);
2053 data = PyUnicode_DATA(string);
2054 len = PyUnicode_GET_LENGTH(string);
2055 targetlen = len;
2056 if (copy_null)
2057 targetlen++;
2058 if (!target) {
2059 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2060 PyErr_NoMemory();
2061 return NULL;
2062 }
2063 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2064 if (!target) {
2065 PyErr_NoMemory();
2066 return NULL;
2067 }
2068 }
2069 else {
2070 if (targetsize < targetlen) {
2071 PyErr_Format(PyExc_SystemError,
2072 "string is longer than the buffer");
2073 if (copy_null && 0 < targetsize)
2074 target[0] = 0;
2075 return NULL;
2076 }
2077 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002078 if (kind == PyUnicode_1BYTE_KIND) {
2079 Py_UCS1 *start = (Py_UCS1 *) data;
2080 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002082 else if (kind == PyUnicode_2BYTE_KIND) {
2083 Py_UCS2 *start = (Py_UCS2 *) data;
2084 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2085 }
2086 else {
2087 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002088 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002089 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002090 if (copy_null)
2091 target[len] = 0;
2092 return target;
2093}
2094
2095Py_UCS4*
2096PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2097 int copy_null)
2098{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002099 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002100 PyErr_BadInternalCall();
2101 return NULL;
2102 }
2103 return as_ucs4(string, target, targetsize, copy_null);
2104}
2105
2106Py_UCS4*
2107PyUnicode_AsUCS4Copy(PyObject *string)
2108{
2109 return as_ucs4(string, NULL, 0, 1);
2110}
2111
#ifdef HAVE_WCHAR_H

/* Create a string from the wchar_t buffer `w` of `size` elements.  A size
   of -1 means that the length is measured with wcslen().  With w == NULL
   only size 0 is accepted (an empty string is returned); any other size is
   rejected with PyErr_BadInternalCall(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w == NULL) {
        if (size != 0) {
            PyErr_BadInternalCall();
            return NULL;
        }
        return PyUnicode_New(0, 0);
    }

    if (size == -1)
        size = wcslen(w);

    return PyUnicode_FromUnicode(w, size);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002132
Walter Dörwald346737f2007-05-31 10:44:43 +00002133static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002134makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2135 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002136{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002137 *fmt++ = '%';
2138 if (width) {
2139 if (zeropad)
2140 *fmt++ = '0';
2141 fmt += sprintf(fmt, "%d", width);
2142 }
2143 if (precision)
2144 fmt += sprintf(fmt, ".%d", precision);
2145 if (longflag)
2146 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002147 else if (longlongflag) {
2148 /* longlongflag should only ever be nonzero on machines with
2149 HAVE_LONG_LONG defined */
2150#ifdef HAVE_LONG_LONG
2151 char *f = PY_FORMAT_LONG_LONG;
2152 while (*f)
2153 *fmt++ = *f++;
2154#else
2155 /* we shouldn't ever get here */
2156 assert(0);
2157 *fmt++ = 'l';
2158#endif
2159 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002160 else if (size_tflag) {
2161 char *f = PY_FORMAT_SIZE_T;
2162 while (*f)
2163 *fmt++ = *f++;
2164 }
2165 *fmt++ = c;
2166 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002167}
2168
/* helper for PyUnicode_FromFormatV()

   Parse the width, the precision and the length modifier of one format
   directive.  `f` points at the start of the directive; its first character
   is skipped.  Each output pointer may be NULL, in which case that value is
   not stored.  Returns the position reached in the format string after the
   digits and any recognized length modifier. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0, precision = 0;
    int longflag = 0, longlongflag = 0, size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* Report the results through the non-NULL output pointers. */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2233
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002234/* maximum number of characters required for output of %ld. 21 characters
2235 allows for 64-bit integers (in decimal) and an optional sign. */
2236#define MAX_LONG_CHARS 21
2237/* maximum number of characters required for output of %lld.
2238 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2239 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2240#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2241
Walter Dörwaldd2034312007-05-18 16:29:38 +00002242PyObject *
2243PyUnicode_FromFormatV(const char *format, va_list vargs)
2244{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002245 va_list count;
2246 Py_ssize_t callcount = 0;
2247 PyObject **callresults = NULL;
2248 PyObject **callresult = NULL;
2249 Py_ssize_t n = 0;
2250 int width = 0;
2251 int precision = 0;
2252 int zeropad;
2253 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002254 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002255 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002256 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002257 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2258 Py_UCS4 argmaxchar;
2259 Py_ssize_t numbersize = 0;
2260 char *numberresults = NULL;
2261 char *numberresult = NULL;
2262 Py_ssize_t i;
2263 int kind;
2264 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002265
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002266 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002267 /* step 1: count the number of %S/%R/%A/%s format specifications
2268 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2269 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002270 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002271 * also estimate a upper bound for all the number formats in the string,
2272 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002274 for (f = format; *f; f++) {
2275 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002276 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2278 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2279 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2280 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002282 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002283#ifdef HAVE_LONG_LONG
2284 if (longlongflag) {
2285 if (width < MAX_LONG_LONG_CHARS)
2286 width = MAX_LONG_LONG_CHARS;
2287 }
2288 else
2289#endif
2290 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2291 including sign. Decimal takes the most space. This
2292 isn't enough for octal. If a width is specified we
2293 need more (which we allocate later). */
2294 if (width < MAX_LONG_CHARS)
2295 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002296
2297 /* account for the size + '\0' to separate numbers
2298 inside of the numberresults buffer */
2299 numbersize += (width + 1);
2300 }
2301 }
2302 else if ((unsigned char)*f > 127) {
2303 PyErr_Format(PyExc_ValueError,
2304 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2305 "string, got a non-ASCII byte: 0x%02x",
2306 (unsigned char)*f);
2307 return NULL;
2308 }
2309 }
2310 /* step 2: allocate memory for the results of
2311 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2312 if (callcount) {
2313 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2314 if (!callresults) {
2315 PyErr_NoMemory();
2316 return NULL;
2317 }
2318 callresult = callresults;
2319 }
2320 /* step 2.5: allocate memory for the results of formating numbers */
2321 if (numbersize) {
2322 numberresults = PyObject_Malloc(numbersize);
2323 if (!numberresults) {
2324 PyErr_NoMemory();
2325 goto fail;
2326 }
2327 numberresult = numberresults;
2328 }
2329
2330 /* step 3: format numbers and figure out how large a buffer we need */
2331 for (f = format; *f; f++) {
2332 if (*f == '%') {
2333 const char* p;
2334 int longflag;
2335 int longlongflag;
2336 int size_tflag;
2337 int numprinted;
2338
2339 p = f;
2340 zeropad = (f[1] == '0');
2341 f = parse_format_flags(f, &width, &precision,
2342 &longflag, &longlongflag, &size_tflag);
2343 switch (*f) {
2344 case 'c':
2345 {
2346 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002347 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002348 n++;
2349 break;
2350 }
2351 case '%':
2352 n++;
2353 break;
2354 case 'i':
2355 case 'd':
2356 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2357 width, precision, *f);
2358 if (longflag)
2359 numprinted = sprintf(numberresult, fmt,
2360 va_arg(count, long));
2361#ifdef HAVE_LONG_LONG
2362 else if (longlongflag)
2363 numprinted = sprintf(numberresult, fmt,
2364 va_arg(count, PY_LONG_LONG));
2365#endif
2366 else if (size_tflag)
2367 numprinted = sprintf(numberresult, fmt,
2368 va_arg(count, Py_ssize_t));
2369 else
2370 numprinted = sprintf(numberresult, fmt,
2371 va_arg(count, int));
2372 n += numprinted;
2373 /* advance by +1 to skip over the '\0' */
2374 numberresult += (numprinted + 1);
2375 assert(*(numberresult - 1) == '\0');
2376 assert(*(numberresult - 2) != '\0');
2377 assert(numprinted >= 0);
2378 assert(numberresult <= numberresults + numbersize);
2379 break;
2380 case 'u':
2381 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2382 width, precision, 'u');
2383 if (longflag)
2384 numprinted = sprintf(numberresult, fmt,
2385 va_arg(count, unsigned long));
2386#ifdef HAVE_LONG_LONG
2387 else if (longlongflag)
2388 numprinted = sprintf(numberresult, fmt,
2389 va_arg(count, unsigned PY_LONG_LONG));
2390#endif
2391 else if (size_tflag)
2392 numprinted = sprintf(numberresult, fmt,
2393 va_arg(count, size_t));
2394 else
2395 numprinted = sprintf(numberresult, fmt,
2396 va_arg(count, unsigned int));
2397 n += numprinted;
2398 numberresult += (numprinted + 1);
2399 assert(*(numberresult - 1) == '\0');
2400 assert(*(numberresult - 2) != '\0');
2401 assert(numprinted >= 0);
2402 assert(numberresult <= numberresults + numbersize);
2403 break;
2404 case 'x':
2405 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2406 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2407 n += numprinted;
2408 numberresult += (numprinted + 1);
2409 assert(*(numberresult - 1) == '\0');
2410 assert(*(numberresult - 2) != '\0');
2411 assert(numprinted >= 0);
2412 assert(numberresult <= numberresults + numbersize);
2413 break;
2414 case 'p':
2415 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2416 /* %p is ill-defined: ensure leading 0x. */
2417 if (numberresult[1] == 'X')
2418 numberresult[1] = 'x';
2419 else if (numberresult[1] != 'x') {
2420 memmove(numberresult + 2, numberresult,
2421 strlen(numberresult) + 1);
2422 numberresult[0] = '0';
2423 numberresult[1] = 'x';
2424 numprinted += 2;
2425 }
2426 n += numprinted;
2427 numberresult += (numprinted + 1);
2428 assert(*(numberresult - 1) == '\0');
2429 assert(*(numberresult - 2) != '\0');
2430 assert(numprinted >= 0);
2431 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002432 break;
2433 case 's':
2434 {
2435 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002436 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002437 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2438 if (!str)
2439 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002440 /* since PyUnicode_DecodeUTF8 returns already flexible
2441 unicode objects, there is no need to call ready on them */
2442 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002443 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002445 /* Remember the str and switch to the next slot */
2446 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002447 break;
2448 }
2449 case 'U':
2450 {
2451 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002452 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002453 if (PyUnicode_READY(obj) == -1)
2454 goto fail;
2455 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002456 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002457 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002458 break;
2459 }
2460 case 'V':
2461 {
2462 PyObject *obj = va_arg(count, PyObject *);
2463 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002464 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002465 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002466 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002467 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002468 if (PyUnicode_READY(obj) == -1)
2469 goto fail;
2470 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002471 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002472 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002473 *callresult++ = NULL;
2474 }
2475 else {
2476 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2477 if (!str_obj)
2478 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002479 if (PyUnicode_READY(str_obj)) {
2480 Py_DECREF(str_obj);
2481 goto fail;
2482 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002483 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002484 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002485 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002486 *callresult++ = str_obj;
2487 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002488 break;
2489 }
2490 case 'S':
2491 {
2492 PyObject *obj = va_arg(count, PyObject *);
2493 PyObject *str;
2494 assert(obj);
2495 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002496 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002497 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002498 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002499 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002500 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002501 /* Remember the str and switch to the next slot */
2502 *callresult++ = str;
2503 break;
2504 }
2505 case 'R':
2506 {
2507 PyObject *obj = va_arg(count, PyObject *);
2508 PyObject *repr;
2509 assert(obj);
2510 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002511 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002512 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002513 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002514 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002515 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002516 /* Remember the repr and switch to the next slot */
2517 *callresult++ = repr;
2518 break;
2519 }
2520 case 'A':
2521 {
2522 PyObject *obj = va_arg(count, PyObject *);
2523 PyObject *ascii;
2524 assert(obj);
2525 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002526 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002527 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002528 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002529 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002530 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002531 /* Remember the repr and switch to the next slot */
2532 *callresult++ = ascii;
2533 break;
2534 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002535 default:
2536 /* if we stumble upon an unknown
2537 formatting code, copy the rest of
2538 the format string to the output
2539 string. (we cannot just skip the
2540 code, since there's no way to know
2541 what's in the argument list) */
2542 n += strlen(p);
2543 goto expand;
2544 }
2545 } else
2546 n++;
2547 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002548 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002549 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002550 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002551 we don't have to resize the string.
2552 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002553 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002554 if (!string)
2555 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556 kind = PyUnicode_KIND(string);
2557 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002558 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002559 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002561 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002562 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002563 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002564
2565 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002566 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2567 /* checking for == because the last argument could be a empty
2568 string, which causes i to point to end, the assert at the end of
2569 the loop */
2570 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002571
Benjamin Peterson14339b62009-01-31 16:36:08 +00002572 switch (*f) {
2573 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002574 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002575 const int ordinal = va_arg(vargs, int);
2576 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002577 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002578 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002579 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002580 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002581 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002582 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002583 case 'p':
2584 /* unused, since we already have the result */
2585 if (*f == 'p')
2586 (void) va_arg(vargs, void *);
2587 else
2588 (void) va_arg(vargs, int);
2589 /* extract the result from numberresults and append. */
2590 for (; *numberresult; ++i, ++numberresult)
2591 PyUnicode_WRITE(kind, data, i, *numberresult);
2592 /* skip over the separating '\0' */
2593 assert(*numberresult == '\0');
2594 numberresult++;
2595 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002596 break;
2597 case 's':
2598 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002599 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002600 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002601 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002602 size = PyUnicode_GET_LENGTH(*callresult);
2603 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002604 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002605 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002606 /* We're done with the unicode()/repr() => forget it */
2607 Py_DECREF(*callresult);
2608 /* switch to next unicode()/repr() result */
2609 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002610 break;
2611 }
2612 case 'U':
2613 {
2614 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002615 Py_ssize_t size;
2616 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2617 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002618 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002620 break;
2621 }
2622 case 'V':
2623 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002624 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002625 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002626 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002627 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002628 size = PyUnicode_GET_LENGTH(obj);
2629 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002630 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002631 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002632 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002633 size = PyUnicode_GET_LENGTH(*callresult);
2634 assert(PyUnicode_KIND(*callresult) <=
2635 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002636 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002638 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002639 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002640 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002641 break;
2642 }
2643 case 'S':
2644 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002645 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002646 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002647 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 /* unused, since we already have the result */
2649 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002650 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002651 copy_characters(string, i, *callresult, 0, size);
2652 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002653 /* We're done with the unicode()/repr() => forget it */
2654 Py_DECREF(*callresult);
2655 /* switch to next unicode()/repr() result */
2656 ++callresult;
2657 break;
2658 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002659 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002660 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002661 break;
2662 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002663 for (; *p; ++p, ++i)
2664 PyUnicode_WRITE(kind, data, i, *p);
2665 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002666 goto end;
2667 }
Victor Stinner1205f272010-09-11 00:54:47 +00002668 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669 else {
2670 assert(i < PyUnicode_GET_LENGTH(string));
2671 PyUnicode_WRITE(kind, data, i++, *f);
2672 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002673 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002674 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002675
Benjamin Peterson29060642009-01-31 22:14:21 +00002676 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002677 if (callresults)
2678 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002679 if (numberresults)
2680 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002681 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002682 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002683 if (callresults) {
2684 PyObject **callresult2 = callresults;
2685 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002686 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002687 ++callresult2;
2688 }
2689 PyObject_Free(callresults);
2690 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002691 if (numberresults)
2692 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002693 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002694}
2695
Walter Dörwaldd2034312007-05-18 16:29:38 +00002696PyObject *
2697PyUnicode_FromFormat(const char *format, ...)
2698{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002699 PyObject* ret;
2700 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002701
2702#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002703 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002704#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002705 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002706#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002707 ret = PyUnicode_FromFormatV(format, vargs);
2708 va_end(vargs);
2709 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002710}
2711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002712#ifdef HAVE_WCHAR_H
2713
Victor Stinner5593d8a2010-10-02 11:11:27 +00002714/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2715 convert a Unicode object to a wide character string.
2716
Victor Stinnerd88d9832011-09-06 02:00:05 +02002717 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002718 character) required to convert the unicode object. Ignore size argument.
2719
Victor Stinnerd88d9832011-09-06 02:00:05 +02002720 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002721 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002722 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002723static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002724unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002725 wchar_t *w,
2726 Py_ssize_t size)
2727{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002728 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002729 const wchar_t *wstr;
2730
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002731 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002732 if (wstr == NULL)
2733 return -1;
2734
Victor Stinner5593d8a2010-10-02 11:11:27 +00002735 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002736 if (size > res)
2737 size = res + 1;
2738 else
2739 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002740 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002741 return res;
2742 }
2743 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002744 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002745}
2746
2747Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002748PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002749 wchar_t *w,
2750 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751{
2752 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002753 PyErr_BadInternalCall();
2754 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002756 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757}
2758
Victor Stinner137c34c2010-09-29 10:25:54 +00002759wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002760PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002761 Py_ssize_t *size)
2762{
2763 wchar_t* buffer;
2764 Py_ssize_t buflen;
2765
2766 if (unicode == NULL) {
2767 PyErr_BadInternalCall();
2768 return NULL;
2769 }
2770
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002771 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002772 if (buflen == -1)
2773 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002774 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002775 PyErr_NoMemory();
2776 return NULL;
2777 }
2778
Victor Stinner137c34c2010-09-29 10:25:54 +00002779 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2780 if (buffer == NULL) {
2781 PyErr_NoMemory();
2782 return NULL;
2783 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002784 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002785 if (buflen == -1)
2786 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002787 if (size != NULL)
2788 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002789 return buffer;
2790}
2791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002792#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002793
Alexander Belopolsky40018472011-02-26 01:02:56 +00002794PyObject *
2795PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002796{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002797 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002798 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002799 PyErr_SetString(PyExc_ValueError,
2800 "chr() arg not in range(0x110000)");
2801 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002802 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002804 if (ordinal < 256)
2805 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002807 v = PyUnicode_New(1, ordinal);
2808 if (v == NULL)
2809 return NULL;
2810 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002811 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002812 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002813}
2814
Alexander Belopolsky40018472011-02-26 01:02:56 +00002815PyObject *
2816PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002818 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002819 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002820 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002821 if (PyUnicode_READY(obj))
2822 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002823 Py_INCREF(obj);
2824 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002825 }
2826 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002827 /* For a Unicode subtype that's not a Unicode object,
2828 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002829 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002830 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002831 PyErr_Format(PyExc_TypeError,
2832 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002833 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002834 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002835}
2836
Alexander Belopolsky40018472011-02-26 01:02:56 +00002837PyObject *
2838PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002839 const char *encoding,
2840 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002841{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002842 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002843 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002844
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002846 PyErr_BadInternalCall();
2847 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002848 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002849
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002850 /* Decoding bytes objects is the most common case and should be fast */
2851 if (PyBytes_Check(obj)) {
2852 if (PyBytes_GET_SIZE(obj) == 0) {
2853 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002854 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002855 }
2856 else {
2857 v = PyUnicode_Decode(
2858 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2859 encoding, errors);
2860 }
2861 return v;
2862 }
2863
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002864 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002865 PyErr_SetString(PyExc_TypeError,
2866 "decoding str is not supported");
2867 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002868 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002869
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002870 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2871 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2872 PyErr_Format(PyExc_TypeError,
2873 "coercing to str: need bytes, bytearray "
2874 "or buffer-like object, %.80s found",
2875 Py_TYPE(obj)->tp_name);
2876 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002877 }
Tim Petersced69f82003-09-16 20:30:58 +00002878
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002879 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002880 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002881 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002882 }
Tim Petersced69f82003-09-16 20:30:58 +00002883 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002884 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002885
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002886 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002887 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002888}
2889
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is treated as "utf-8".

   The normalized name, including its terminating NUL, is written to lower,
   which has room for lower_len bytes.  Return 1 on success, 0 on error (the
   normalized name is longer than lower_len-1 characters); on error the
   content of lower is unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* Without room for the terminating NUL nothing can be stored; this also
       keeps lower_len - 1 below from wrapping around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof() counts the terminating NUL, so this is the exact number
           of bytes needed: honour lower_len here as well. */
        if (sizeof(default_encoding) > lower_len)
            return 0;
        memcpy(lower, default_encoding, sizeof(default_encoding));
        return 1;
    }

    e = encoding;
    l = lower;
    l_end = &lower[lower_len - 1];      /* slot reserved for the NUL */
    while (*e) {
        char c;

        if (l == l_end)
            return 0;
        c = *e++;
        /* ASCII-only, locale-independent case folding: encoding names are
           compared against ASCII literals by the callers. */
        if (c >= 'A' && c <= 'Z')
            c = (char)(c - 'A' + 'a');
        else if (c == '_')
            c = '-';
        *l++ = c;
    }
    *l = '\0';
    return 1;
}
2926
Alexander Belopolsky40018472011-02-26 01:02:56 +00002927PyObject *
2928PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002929 Py_ssize_t size,
2930 const char *encoding,
2931 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002932{
2933 PyObject *buffer = NULL, *unicode;
2934 Py_buffer info;
2935 char lower[11]; /* Enough for any encoding shortcut */
2936
Fred Drakee4315f52000-05-09 19:53:39 +00002937 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002938 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002939 if ((strcmp(lower, "utf-8") == 0) ||
2940 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002941 return PyUnicode_DecodeUTF8(s, size, errors);
2942 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002943 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002944 (strcmp(lower, "iso-8859-1") == 0))
2945 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002946#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002947 else if (strcmp(lower, "mbcs") == 0)
2948 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002949#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002950 else if (strcmp(lower, "ascii") == 0)
2951 return PyUnicode_DecodeASCII(s, size, errors);
2952 else if (strcmp(lower, "utf-16") == 0)
2953 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2954 else if (strcmp(lower, "utf-32") == 0)
2955 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2956 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002957
2958 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002959 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002960 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002961 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002962 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963 if (buffer == NULL)
2964 goto onError;
2965 unicode = PyCodec_Decode(buffer, encoding, errors);
2966 if (unicode == NULL)
2967 goto onError;
2968 if (!PyUnicode_Check(unicode)) {
2969 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002970 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002971 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972 Py_DECREF(unicode);
2973 goto onError;
2974 }
2975 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002976 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002977
Benjamin Peterson29060642009-01-31 22:14:21 +00002978 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 Py_XDECREF(buffer);
2980 return NULL;
2981}
2982
Alexander Belopolsky40018472011-02-26 01:02:56 +00002983PyObject *
2984PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002985 const char *encoding,
2986 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002987{
2988 PyObject *v;
2989
2990 if (!PyUnicode_Check(unicode)) {
2991 PyErr_BadArgument();
2992 goto onError;
2993 }
2994
2995 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002996 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002997
2998 /* Decode via the codec registry */
2999 v = PyCodec_Decode(unicode, encoding, errors);
3000 if (v == NULL)
3001 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003002 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003003
Benjamin Peterson29060642009-01-31 22:14:21 +00003004 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003005 return NULL;
3006}
3007
Alexander Belopolsky40018472011-02-26 01:02:56 +00003008PyObject *
3009PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003010 const char *encoding,
3011 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003012{
3013 PyObject *v;
3014
3015 if (!PyUnicode_Check(unicode)) {
3016 PyErr_BadArgument();
3017 goto onError;
3018 }
3019
3020 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003021 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003022
3023 /* Decode via the codec registry */
3024 v = PyCodec_Decode(unicode, encoding, errors);
3025 if (v == NULL)
3026 goto onError;
3027 if (!PyUnicode_Check(v)) {
3028 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003029 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003030 Py_TYPE(v)->tp_name);
3031 Py_DECREF(v);
3032 goto onError;
3033 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003034 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003035
Benjamin Peterson29060642009-01-31 22:14:21 +00003036 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003037 return NULL;
3038}
3039
Alexander Belopolsky40018472011-02-26 01:02:56 +00003040PyObject *
3041PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003042 Py_ssize_t size,
3043 const char *encoding,
3044 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045{
3046 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003047
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048 unicode = PyUnicode_FromUnicode(s, size);
3049 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003050 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3052 Py_DECREF(unicode);
3053 return v;
3054}
3055
Alexander Belopolsky40018472011-02-26 01:02:56 +00003056PyObject *
3057PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003058 const char *encoding,
3059 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003060{
3061 PyObject *v;
3062
3063 if (!PyUnicode_Check(unicode)) {
3064 PyErr_BadArgument();
3065 goto onError;
3066 }
3067
3068 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003069 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003070
3071 /* Encode via the codec registry */
3072 v = PyCodec_Encode(unicode, encoding, errors);
3073 if (v == NULL)
3074 goto onError;
3075 return v;
3076
Benjamin Peterson29060642009-01-31 22:14:21 +00003077 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003078 return NULL;
3079}
3080
Victor Stinnerad158722010-10-27 00:25:46 +00003081PyObject *
3082PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003083{
Victor Stinner99b95382011-07-04 14:23:54 +02003084#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003085 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003086#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003087 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003088#else
Victor Stinner793b5312011-04-27 00:24:21 +02003089 PyInterpreterState *interp = PyThreadState_GET()->interp;
3090 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3091 cannot use it to encode and decode filenames before it is loaded. Load
3092 the Python codec requires to encode at least its own filename. Use the C
3093 version of the locale codec until the codec registry is initialized and
3094 the Python codec is loaded.
3095
3096 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3097 cannot only rely on it: check also interp->fscodec_initialized for
3098 subinterpreters. */
3099 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003100 return PyUnicode_AsEncodedString(unicode,
3101 Py_FileSystemDefaultEncoding,
3102 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003103 }
3104 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003105 /* locale encoding with surrogateescape */
3106 wchar_t *wchar;
3107 char *bytes;
3108 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003109 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003110
3111 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3112 if (wchar == NULL)
3113 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003114 bytes = _Py_wchar2char(wchar, &error_pos);
3115 if (bytes == NULL) {
3116 if (error_pos != (size_t)-1) {
3117 char *errmsg = strerror(errno);
3118 PyObject *exc = NULL;
3119 if (errmsg == NULL)
3120 errmsg = "Py_wchar2char() failed";
3121 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003122 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003123 error_pos, error_pos+1,
3124 errmsg);
3125 Py_XDECREF(exc);
3126 }
3127 else
3128 PyErr_NoMemory();
3129 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003130 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003131 }
3132 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003133
3134 bytes_obj = PyBytes_FromString(bytes);
3135 PyMem_Free(bytes);
3136 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003137 }
Victor Stinnerad158722010-10-27 00:25:46 +00003138#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003139}
3140
Alexander Belopolsky40018472011-02-26 01:02:56 +00003141PyObject *
3142PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003143 const char *encoding,
3144 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003145{
3146 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003147 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003148
Guido van Rossumd57fd912000-03-10 22:53:23 +00003149 if (!PyUnicode_Check(unicode)) {
3150 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003151 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152 }
Fred Drakee4315f52000-05-09 19:53:39 +00003153
Fred Drakee4315f52000-05-09 19:53:39 +00003154 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003155 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003156 if ((strcmp(lower, "utf-8") == 0) ||
3157 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003158 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003159 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003160 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003161 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003162 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003163 }
Victor Stinner37296e82010-06-10 13:36:23 +00003164 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003165 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003166 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003167 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003168#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003169 else if (strcmp(lower, "mbcs") == 0)
3170 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003171#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003172 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003173 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003174 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175
3176 /* Encode via the codec registry */
3177 v = PyCodec_Encode(unicode, encoding, errors);
3178 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003179 return NULL;
3180
3181 /* The normal path */
3182 if (PyBytes_Check(v))
3183 return v;
3184
3185 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003186 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003187 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003188 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003189
3190 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3191 "encoder %s returned bytearray instead of bytes",
3192 encoding);
3193 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003194 Py_DECREF(v);
3195 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003196 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003197
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003198 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3199 Py_DECREF(v);
3200 return b;
3201 }
3202
3203 PyErr_Format(PyExc_TypeError,
3204 "encoder did not return a bytes object (type=%.400s)",
3205 Py_TYPE(v)->tp_name);
3206 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003207 return NULL;
3208}
3209
Alexander Belopolsky40018472011-02-26 01:02:56 +00003210PyObject *
3211PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003212 const char *encoding,
3213 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003214{
3215 PyObject *v;
3216
3217 if (!PyUnicode_Check(unicode)) {
3218 PyErr_BadArgument();
3219 goto onError;
3220 }
3221
3222 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003223 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003224
3225 /* Encode via the codec registry */
3226 v = PyCodec_Encode(unicode, encoding, errors);
3227 if (v == NULL)
3228 goto onError;
3229 if (!PyUnicode_Check(v)) {
3230 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003231 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003232 Py_TYPE(v)->tp_name);
3233 Py_DECREF(v);
3234 goto onError;
3235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003237
Benjamin Peterson29060642009-01-31 22:14:21 +00003238 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 return NULL;
3240}
3241
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003242PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003243PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003244 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003245 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3246}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003247
Christian Heimes5894ba72007-11-04 11:43:14 +00003248PyObject*
3249PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3250{
Victor Stinner99b95382011-07-04 14:23:54 +02003251#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003252 return PyUnicode_DecodeMBCS(s, size, NULL);
3253#elif defined(__APPLE__)
3254 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3255#else
Victor Stinner793b5312011-04-27 00:24:21 +02003256 PyInterpreterState *interp = PyThreadState_GET()->interp;
3257 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3258 cannot use it to encode and decode filenames before it is loaded. Load
3259 the Python codec requires to encode at least its own filename. Use the C
3260 version of the locale codec until the codec registry is initialized and
3261 the Python codec is loaded.
3262
3263 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3264 cannot only rely on it: check also interp->fscodec_initialized for
3265 subinterpreters. */
3266 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003267 return PyUnicode_Decode(s, size,
3268 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003269 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003270 }
3271 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003272 /* locale encoding with surrogateescape */
3273 wchar_t *wchar;
3274 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003275 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003276
3277 if (s[size] != '\0' || size != strlen(s)) {
3278 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3279 return NULL;
3280 }
3281
Victor Stinner168e1172010-10-16 23:16:16 +00003282 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003283 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003284 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003285
Victor Stinner168e1172010-10-16 23:16:16 +00003286 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003287 PyMem_Free(wchar);
3288 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003289 }
Victor Stinnerad158722010-10-27 00:25:46 +00003290#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003291}
3292
Martin v. Löwis011e8422009-05-05 04:43:17 +00003293
3294int
3295PyUnicode_FSConverter(PyObject* arg, void* addr)
3296{
3297 PyObject *output = NULL;
3298 Py_ssize_t size;
3299 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003300 if (arg == NULL) {
3301 Py_DECREF(*(PyObject**)addr);
3302 return 1;
3303 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003304 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003305 output = arg;
3306 Py_INCREF(output);
3307 }
3308 else {
3309 arg = PyUnicode_FromObject(arg);
3310 if (!arg)
3311 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003312 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003313 Py_DECREF(arg);
3314 if (!output)
3315 return 0;
3316 if (!PyBytes_Check(output)) {
3317 Py_DECREF(output);
3318 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3319 return 0;
3320 }
3321 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003322 size = PyBytes_GET_SIZE(output);
3323 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003324 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003325 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003326 Py_DECREF(output);
3327 return 0;
3328 }
3329 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003330 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003331}
3332
3333
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003334int
3335PyUnicode_FSDecoder(PyObject* arg, void* addr)
3336{
3337 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003338 if (arg == NULL) {
3339 Py_DECREF(*(PyObject**)addr);
3340 return 1;
3341 }
3342 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003343 if (PyUnicode_READY(arg))
3344 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003345 output = arg;
3346 Py_INCREF(output);
3347 }
3348 else {
3349 arg = PyBytes_FromObject(arg);
3350 if (!arg)
3351 return 0;
3352 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3353 PyBytes_GET_SIZE(arg));
3354 Py_DECREF(arg);
3355 if (!output)
3356 return 0;
3357 if (!PyUnicode_Check(output)) {
3358 Py_DECREF(output);
3359 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3360 return 0;
3361 }
3362 }
Victor Stinner065836e2011-10-27 01:56:33 +02003363 if (PyUnicode_READY(output) < 0) {
3364 Py_DECREF(output);
3365 return 0;
3366 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003367 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003368 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003369 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3370 Py_DECREF(output);
3371 return 0;
3372 }
3373 *(PyObject**)addr = output;
3374 return Py_CLEANUP_SUPPORTED;
3375}
3376
3377
Martin v. Löwis5b222132007-06-10 09:51:05 +00003378char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003379PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003380{
Christian Heimesf3863112007-11-22 07:46:41 +00003381 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003382
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003383 if (!PyUnicode_Check(unicode)) {
3384 PyErr_BadArgument();
3385 return NULL;
3386 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003387 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003388 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003389
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003390 if (PyUnicode_UTF8(unicode) == NULL) {
3391 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003392 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3393 if (bytes == NULL)
3394 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003395 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3396 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003397 Py_DECREF(bytes);
3398 return NULL;
3399 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003400 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3401 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3402 PyBytes_AS_STRING(bytes),
3403 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003404 Py_DECREF(bytes);
3405 }
3406
3407 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003408 *psize = PyUnicode_UTF8_LENGTH(unicode);
3409 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003410}
3411
/* Same as PyUnicode_AsUTF8AndSize(), for callers that do not need the
   length of the UTF-8 data. */
char*
PyUnicode_AsUTF8(PyObject *unicode)
{
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
}
3417
#ifdef Py_DEBUG
/* Debug builds only: incremented by PyUnicode_AsUnicodeAndSize() each time
   it has to build the wchar_t representation of a string. */
static int unicode_as_unicode_calls = 0;
#endif
3421
3422
/* Return the wchar_t (Py_UNICODE) representation of the str object unicode.

   If the object has no such representation yet, it is built from the 1, 2
   or 4 byte character data and stored in the object.  If size is not NULL,
   the length of the representation is stored in *size.

   Return NULL on error: an exception is set when unicode is not a str
   object or when memory allocation fails. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* No wchar_t representation stored yet: build it from the
           character data of the (ready) object. */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

#ifdef Py_DEBUG
        ++unicode_as_unicode_calls;
#endif

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            /* First pass: every character above 0xFFFF needs two wchar_t
               units (a surrogate pair), i.e. one extra unit. */
            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* + 1 for the terminating null character. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            /* Second pass: copy the characters, splitting those above
               0xFFFF into a high and a low surrogate. */
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= 0x10FFFF);
                    /* encode surrogate pair in this case */
                    *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
                    *w   = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
                }
                else
                    *w = *four_bytes;

                /* Sanity check of the length computed by the first pass
                   (only effective when assertions are enabled). */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            /* null-terminate the wstr */
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* 1 or 2 byte characters: exactly one wchar_t per character,
               plus one for the terminating null character. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* NOTE(review): presumably compact ASCII objects have no
               separate wstr length to update -- confirm against the object
               layout. */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* Undo the allocation made above before reporting the
                   fatal error. */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
3538
/* Same as PyUnicode_AsUnicodeAndSize(), for callers that do not need the
   length of the representation. */
Py_UNICODE *
PyUnicode_AsUnicode(PyObject *unicode)
{
    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
}
3544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003545
Alexander Belopolsky40018472011-02-26 01:02:56 +00003546Py_ssize_t
3547PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003548{
3549 if (!PyUnicode_Check(unicode)) {
3550 PyErr_BadArgument();
3551 goto onError;
3552 }
3553 return PyUnicode_GET_SIZE(unicode);
3554
Benjamin Peterson29060642009-01-31 22:14:21 +00003555 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556 return -1;
3557}
3558
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003559Py_ssize_t
3560PyUnicode_GetLength(PyObject *unicode)
3561{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003562 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003563 PyErr_BadArgument();
3564 return -1;
3565 }
3566
3567 return PyUnicode_GET_LENGTH(unicode);
3568}
3569
3570Py_UCS4
3571PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3572{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003573 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3574 PyErr_BadArgument();
3575 return (Py_UCS4)-1;
3576 }
3577 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3578 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003579 return (Py_UCS4)-1;
3580 }
3581 return PyUnicode_READ_CHAR(unicode, index);
3582}
3583
3584int
3585PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3586{
3587 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003588 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003589 return -1;
3590 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003591 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3592 PyErr_SetString(PyExc_IndexError, "string index out of range");
3593 return -1;
3594 }
3595 if (_PyUnicode_Dirty(unicode))
3596 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003597 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3598 index, ch);
3599 return 0;
3600}
3601
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3607
Victor Stinner554f3f02010-06-16 23:33:54 +00003608/* create or adjust a UnicodeDecodeError */
3609static void
3610make_decode_exception(PyObject **exceptionObject,
3611 const char *encoding,
3612 const char *input, Py_ssize_t length,
3613 Py_ssize_t startpos, Py_ssize_t endpos,
3614 const char *reason)
3615{
3616 if (*exceptionObject == NULL) {
3617 *exceptionObject = PyUnicodeDecodeError_Create(
3618 encoding, input, length, startpos, endpos, reason);
3619 }
3620 else {
3621 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3622 goto onError;
3623 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3624 goto onError;
3625 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3626 goto onError;
3627 }
3628 return;
3629
3630onError:
3631 Py_DECREF(*exceptionObject);
3632 *exceptionObject = NULL;
3633}
3634
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003635/* error handling callback helper:
3636 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003637 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003638 and adjust various state variables.
3639 return 0 on success, -1 on error
3640*/
3641
Alexander Belopolsky40018472011-02-26 01:02:56 +00003642static int
3643unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003644 const char *encoding, const char *reason,
3645 const char **input, const char **inend, Py_ssize_t *startinpos,
3646 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003647 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003648{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003649 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650
3651 PyObject *restuple = NULL;
3652 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003653 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003654 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003655 Py_ssize_t requiredsize;
3656 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003657 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003658 int res = -1;
3659
Victor Stinner596a6c42011-11-09 00:02:18 +01003660 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3661 outsize = PyUnicode_GET_LENGTH(*output);
3662 else
3663 outsize = _PyUnicode_WSTR_LENGTH(*output);
3664
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003665 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003666 *errorHandler = PyCodec_LookupError(errors);
3667 if (*errorHandler == NULL)
3668 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 }
3670
Victor Stinner554f3f02010-06-16 23:33:54 +00003671 make_decode_exception(exceptionObject,
3672 encoding,
3673 *input, *inend - *input,
3674 *startinpos, *endinpos,
3675 reason);
3676 if (*exceptionObject == NULL)
3677 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003678
3679 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3680 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003681 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003682 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003683 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003684 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 }
3686 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003687 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003688 if (PyUnicode_READY(repunicode) < 0)
3689 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003690
3691 /* Copy back the bytes variables, which might have been modified by the
3692 callback */
3693 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3694 if (!inputobj)
3695 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003696 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003697 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003698 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003699 *input = PyBytes_AS_STRING(inputobj);
3700 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003701 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003702 /* we can DECREF safely, as the exception has another reference,
3703 so the object won't go away. */
3704 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003705
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003706 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003707 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003708 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003709 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3710 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003711 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003712
Victor Stinner596a6c42011-11-09 00:02:18 +01003713 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3714 /* need more space? (at least enough for what we
3715 have+the replacement+the rest of the string (starting
3716 at the new input position), so we won't have to check space
3717 when there are no errors in the rest of the string) */
3718 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3719 requiredsize = *outpos + replen + insize-newpos;
3720 if (requiredsize > outsize) {
3721 if (requiredsize<2*outsize)
3722 requiredsize = 2*outsize;
3723 if (unicode_resize(output, requiredsize) < 0)
3724 goto onError;
3725 }
3726 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003727 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003728 copy_characters(*output, *outpos, repunicode, 0, replen);
3729 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003730 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003731 else {
3732 wchar_t *repwstr;
3733 Py_ssize_t repwlen;
3734 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3735 if (repwstr == NULL)
3736 goto onError;
3737 /* need more space? (at least enough for what we
3738 have+the replacement+the rest of the string (starting
3739 at the new input position), so we won't have to check space
3740 when there are no errors in the rest of the string) */
3741 requiredsize = *outpos + repwlen + insize-newpos;
3742 if (requiredsize > outsize) {
3743 if (requiredsize < 2*outsize)
3744 requiredsize = 2*outsize;
3745 if (unicode_resize(output, requiredsize) < 0)
3746 goto onError;
3747 }
3748 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3749 *outpos += repwlen;
3750 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003751 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003752 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003753
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003754 /* we made it! */
3755 res = 0;
3756
Benjamin Peterson29060642009-01-31 22:14:21 +00003757 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758 Py_XDECREF(restuple);
3759 return res;
3760}
3761
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003762/* --- UTF-7 Codec -------------------------------------------------------- */
3763
Antoine Pitrou244651a2009-05-04 18:56:13 +00003764/* See RFC2152 for details. We encode conservatively and decode liberally. */
3765
/* Three small helper macros implementing the base-64 alphabet
   (A-Z, a-z, 0-9, '+', '/'), plus the direct-decoding test. */

/* IS_BASE64: non-zero when c belongs to the base-64 alphabet. */
#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* FROM_BASE64: the 6-bit value of c.  Only meaningful when IS_BASE64(c)
   holds: anything that is not a letter, digit or '+' maps to 63. */
#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character for the low 6 bits of n. */
#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: a byte met in a UTF-7 string that decodes as itself.
 * Decoding is permissive: every ASCII byte is taken literally except
 * '+', which opens a base-64 section. */
#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
3796
/* Classification of the 128 ASCII characters for the UTF-7 encoder
 * (see RFC2152).  Each entry of utf7_category is one of:
 *   0 : "Set D"      - alphanumerics and '(),-./:?
 *   1 : "Set O"      - !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace" - ht nl cr sp
 *   3 : special, must be base64 encoded - everything else
 *       (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
    /* 0x00-0x0F: nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
    /* 0x10-0x1F: dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
    /* 0x20-0x2F: sp ! " # $ % & ' ( ) * + , - . / */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
    /* 0x30-0x3F: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
    /* 0x40-0x4F: @ A B C D E F G H I J K L M N O */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x50-0x5F: P Q R S T U V W X Y Z [ \ ] ^ _ */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
    /* 0x60-0x6F: ` a b c d e f g h i j k l m n o */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x70-0x7F: p q r s t u v w x y z { | } ~ del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: non-zero when character c is to be emitted as itself.
 * Set D characters always are; Set O characters only when directO is
 * true and whitespace only when directWS is true.  RFC2152 leaves both
 * choices to the application, hence the two flags.  NUL and anything
 * outside ASCII are never direct. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003842
Alexander Belopolsky40018472011-02-26 01:02:56 +00003843PyObject *
3844PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003845 Py_ssize_t size,
3846 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003847{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003848 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3849}
3850
Antoine Pitrou244651a2009-05-04 18:56:13 +00003851/* The decoder. The only state we preserve is our read position,
3852 * i.e. how many characters we have consumed. So if we end in the
3853 * middle of a shift sequence we have to back off the read position
3854 * and the output to the beginning of the sequence, otherwise we lose
3855 * all the shift state (seen bits, number of bits seen, high
3856 * surrogate). */
3857
Alexander Belopolsky40018472011-02-26 01:02:56 +00003858PyObject *
3859PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003860 Py_ssize_t size,
3861 const char *errors,
3862 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003863{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003864 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003865 Py_ssize_t startinpos;
3866 Py_ssize_t endinpos;
3867 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003868 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003869 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003870 const char *errmsg = "";
3871 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003872 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003873 unsigned int base64bits = 0;
3874 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01003875 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003876 PyObject *errorHandler = NULL;
3877 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003878
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003879 /* Start off assuming it's all ASCII. Widen later as necessary. */
3880 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003881 if (!unicode)
3882 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003883 if (size == 0) {
3884 if (consumed)
3885 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003886 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003887 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003888
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003889 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003890 e = s + size;
3891
3892 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003893 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003894 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003895 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003896
Antoine Pitrou244651a2009-05-04 18:56:13 +00003897 if (inShift) { /* in a base-64 section */
3898 if (IS_BASE64(ch)) { /* consume a base-64 character */
3899 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3900 base64bits += 6;
3901 s++;
3902 if (base64bits >= 16) {
3903 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01003904 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00003905 base64bits -= 16;
3906 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3907 if (surrogate) {
3908 /* expecting a second surrogate */
3909 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003910 Py_UCS4 ch2 = (((surrogate & 0x3FF)<<10)
3911 | (outCh & 0x3FF)) + 0x10000;
3912 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3913 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003914 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003915 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003916 }
3917 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003918 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3919 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003920 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003921 }
3922 }
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003923 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003924 /* first surrogate */
3925 surrogate = outCh;
3926 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003927 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003928 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3929 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003930 }
3931 }
3932 }
3933 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003934 inShift = 0;
3935 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003936 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003937 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3938 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003939 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003940 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003941 if (base64bits > 0) { /* left-over bits */
3942 if (base64bits >= 6) {
3943 /* We've seen at least one base-64 character */
3944 errmsg = "partial character in shift sequence";
3945 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003946 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003947 else {
3948 /* Some bits remain; they should be zero */
3949 if (base64buffer != 0) {
3950 errmsg = "non-zero padding bits in shift sequence";
3951 goto utf7Error;
3952 }
3953 }
3954 }
3955 if (ch != '-') {
3956 /* '-' is absorbed; other terminating
3957 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003958 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3959 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003960 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003961 }
3962 }
3963 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003965 s++; /* consume '+' */
3966 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003967 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003968 if (unicode_putchar(&unicode, &outpos, '+') < 0)
3969 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003970 }
3971 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003972 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003973 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003974 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003975 }
3976 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003977 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003978 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3979 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003980 s++;
3981 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003982 else {
3983 startinpos = s-starts;
3984 s++;
3985 errmsg = "unexpected special character";
3986 goto utf7Error;
3987 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003988 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003989utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003990 endinpos = s-starts;
3991 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003992 errors, &errorHandler,
3993 "utf7", errmsg,
3994 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003995 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003996 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003997 }
3998
Antoine Pitrou244651a2009-05-04 18:56:13 +00003999 /* end of string */
4000
4001 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4002 /* if we're in an inconsistent state, that's an error */
4003 if (surrogate ||
4004 (base64bits >= 6) ||
4005 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004006 endinpos = size;
4007 if (unicode_decode_call_errorhandler(
4008 errors, &errorHandler,
4009 "utf7", "unterminated shift sequence",
4010 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004011 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004012 goto onError;
4013 if (s < e)
4014 goto restart;
4015 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004016 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004017
4018 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004019 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004020 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004021 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004022 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004023 }
4024 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004025 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004026 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004027 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004028
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004029 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004030 goto onError;
4031
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004032 Py_XDECREF(errorHandler);
4033 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004034 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004035
Benjamin Peterson29060642009-01-31 22:14:21 +00004036 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004037 Py_XDECREF(errorHandler);
4038 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004039 Py_DECREF(unicode);
4040 return NULL;
4041}
4042
4043
Alexander Belopolsky40018472011-02-26 01:02:56 +00004044PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004045_PyUnicode_EncodeUTF7(PyObject *str,
4046 int base64SetO,
4047 int base64WhiteSpace,
4048 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004049{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004050 int kind;
4051 void *data;
4052 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004053 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004054 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004055 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004056 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004057 unsigned int base64bits = 0;
4058 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004059 char * out;
4060 char * start;
4061
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004062 if (PyUnicode_READY(str) < 0)
4063 return NULL;
4064 kind = PyUnicode_KIND(str);
4065 data = PyUnicode_DATA(str);
4066 len = PyUnicode_GET_LENGTH(str);
4067
4068 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004069 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004070
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004071 /* It might be possible to tighten this worst case */
4072 allocated = 8 * len;
4073 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004074 return PyErr_NoMemory();
4075
Antoine Pitrou244651a2009-05-04 18:56:13 +00004076 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004077 if (v == NULL)
4078 return NULL;
4079
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004080 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004081 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004082 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004083
Antoine Pitrou244651a2009-05-04 18:56:13 +00004084 if (inShift) {
4085 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4086 /* shifting out */
4087 if (base64bits) { /* output remaining bits */
4088 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4089 base64buffer = 0;
4090 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004091 }
4092 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004093 /* Characters not in the BASE64 set implicitly unshift the sequence
4094 so no '-' is required, except if the character is itself a '-' */
4095 if (IS_BASE64(ch) || ch == '-') {
4096 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004097 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004098 *out++ = (char) ch;
4099 }
4100 else {
4101 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004102 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004103 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004104 else { /* not in a shift sequence */
4105 if (ch == '+') {
4106 *out++ = '+';
4107 *out++ = '-';
4108 }
4109 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4110 *out++ = (char) ch;
4111 }
4112 else {
4113 *out++ = '+';
4114 inShift = 1;
4115 goto encode_char;
4116 }
4117 }
4118 continue;
4119encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004120 if (ch >= 0x10000) {
Victor Stinner0d3721d2011-11-22 03:27:53 +01004121 assert(ch <= 0x10FFFF);
4122
Antoine Pitrou244651a2009-05-04 18:56:13 +00004123 /* code first surrogate */
4124 base64bits += 16;
4125 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4126 while (base64bits >= 6) {
4127 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4128 base64bits -= 6;
4129 }
4130 /* prepare second surrogate */
4131 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4132 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004133 base64bits += 16;
4134 base64buffer = (base64buffer << 16) | ch;
4135 while (base64bits >= 6) {
4136 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4137 base64bits -= 6;
4138 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004139 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004140 if (base64bits)
4141 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4142 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004143 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004144 if (_PyBytes_Resize(&v, out - start) < 0)
4145 return NULL;
4146 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004147}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004148PyObject *
4149PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4150 Py_ssize_t size,
4151 int base64SetO,
4152 int base64WhiteSpace,
4153 const char *errors)
4154{
4155 PyObject *result;
4156 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4157 if (tmp == NULL)
4158 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004159 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004160 base64WhiteSpace, errors);
4161 Py_DECREF(tmp);
4162 return result;
4163}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004164
Antoine Pitrou244651a2009-05-04 18:56:13 +00004165#undef IS_BASE64
4166#undef FROM_BASE64
4167#undef TO_BASE64
4168#undef DECODE_DIRECT
4169#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004170
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171/* --- UTF-8 Codec -------------------------------------------------------- */
4172
/* Map a UTF-8 encoded prefix (first) byte to the length of the sequence it
   starts.  Zero means illegal prefix.  See RFC 3629 for details. */
static
char utf8_code_length[256] = {
    /* 00-7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-BF: never a valid prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1 (illegal) + C2-CF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0-DF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4 + F5-FF (illegal) */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
4194
Alexander Belopolsky40018472011-02-26 01:02:56 +00004195PyObject *
4196PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004197 Py_ssize_t size,
4198 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199{
Walter Dörwald69652032004-09-07 20:24:22 +00004200 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4201}
4202
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004203#include "stringlib/ucs1lib.h"
4204#include "stringlib/codecs.h"
4205#include "stringlib/undef.h"
4206
4207#include "stringlib/ucs2lib.h"
4208#include "stringlib/codecs.h"
4209#include "stringlib/undef.h"
4210
4211#include "stringlib/ucs4lib.h"
4212#include "stringlib/codecs.h"
4213#include "stringlib/undef.h"
4214
Antoine Pitrouab868312009-01-10 15:40:25 +00004215/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4216#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4217
4218/* Mask to quickly check whether a C 'long' contains a
4219 non-ASCII, UTF8-encoded char. */
4220#if (SIZEOF_LONG == 8)
4221# define ASCII_CHAR_MASK 0x8080808080808080L
4222#elif (SIZEOF_LONG == 4)
4223# define ASCII_CHAR_MASK 0x80808080L
4224#else
4225# error C 'long' size should be either 4 or 8!
4226#endif
4227
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004228/* Scans a UTF-8 string and returns the maximum character to be expected
4229 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004230
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004231 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004232 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004233 */
4234static Py_UCS4
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004235utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
4236 Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004237{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004239 const unsigned char *p = (const unsigned char *)s;
4240 const unsigned char *end = p + string_size;
4241 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004242
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004243 assert(unicode_size != NULL);
4244
4245 /* By having a cascade of independent loops which fallback onto each
4246 other, we minimize the amount of work done in the average loop
4247 iteration, and we also maximize the CPU's ability to predict
4248 branches correctly (because a given condition will have always the
4249 same boolean outcome except perhaps in the last iteration of the
4250 corresponding loop).
4251 In the general case this brings us rather close to decoding
4252 performance pre-PEP 393, despite the two-pass decoding.
4253
4254 Note that the pure ASCII loop is not duplicated once a non-ASCII
4255 character has been encountered. It is actually a pessimization (by
4256 a significant factor) to use this loop on text with many non-ASCII
4257 characters, and it is important to avoid bad performance on valid
4258 utf-8 data (invalid utf-8 being a different can of worms).
4259 */
4260
4261 /* ASCII */
4262 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004263 /* Only check value if it's not a ASCII char... */
4264 if (*p < 0x80) {
4265 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4266 an explanation. */
4267 if (!((size_t) p & LONG_PTR_MASK)) {
4268 /* Help register allocation */
4269 register const unsigned char *_p = p;
4270 while (_p < aligned_end) {
4271 unsigned long value = *(unsigned long *) _p;
4272 if (value & ASCII_CHAR_MASK)
4273 break;
4274 _p += SIZEOF_LONG;
4275 char_count += SIZEOF_LONG;
4276 }
4277 p = _p;
4278 if (p == end)
4279 break;
4280 }
4281 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004282 if (*p < 0x80)
4283 ++char_count;
4284 else
4285 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004286 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004287 *unicode_size = char_count;
4288 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004289
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004290_ucs1loop:
4291 for (; p < end; ++p) {
4292 if (*p < 0xc4)
4293 char_count += ((*p & 0xc0) != 0x80);
4294 else
4295 goto _ucs2loop;
4296 }
4297 *unicode_size = char_count;
4298 return 255;
4299
4300_ucs2loop:
4301 for (; p < end; ++p) {
4302 if (*p < 0xf0)
4303 char_count += ((*p & 0xc0) != 0x80);
4304 else
4305 goto _ucs4loop;
4306 }
4307 *unicode_size = char_count;
4308 return 65535;
4309
4310_ucs4loop:
4311 for (; p < end; ++p) {
4312 char_count += ((*p & 0xc0) != 0x80);
4313 }
4314 *unicode_size = char_count;
4315 return 65537;
4316}
4317
4318/* Called when we encountered some error that wasn't detected in the original
4319 scan, e.g. an encoded surrogate character. The original maxchar computation
4320 may have been incorrect, so redo it. */
4321static int
4322refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
4323{
4324 PyObject *tmp;
Victor Stinnerf8facac2011-11-22 02:30:47 +01004325 Py_ssize_t k;
4326 Py_UCS4 maxchar;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004327 for (k = 0, maxchar = 0; k < n; k++)
4328 maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
4329 tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
4330 if (tmp == NULL)
4331 return -1;
4332 PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
4333 Py_DECREF(*unicode);
4334 *unicode = tmp;
4335 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004336}
4337
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors. Implicit parameters: unicode, kind, data, has_errors,
   onError. Potential resizing overallocates, so the result needs to shrink
   at the end.

   While has_errors is false, the value is stored directly through
   kind/data with PyUnicode_WRITE.  Once has_errors is set, index is read
   once into a local, the string is grown if that index is not a valid
   position yet, and the value is stored with unicode_putchar().  Valid
   positions are 0 .. length-1, so index == length already requires at
   least one more slot.  Control jumps to onError if resizing or storing
   fails.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        if (has_errors) {                                               \
            Py_ssize_t pos = index;                                     \
            if (pos >= PyUnicode_GET_LENGTH(unicode) &&                 \
                unicode_resize(&unicode, pos + pos/8 + 1) < 0)          \
                goto onError;                                           \
            if (unicode_putchar(&unicode, &pos, value) < 0)             \
                goto onError;                                           \
        }                                                               \
        else                                                            \
            PyUnicode_WRITE(kind, data, index, value);                  \
    } while (0)
4356
Alexander Belopolsky40018472011-02-26 01:02:56 +00004357PyObject *
4358PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004359 Py_ssize_t size,
4360 const char *errors,
4361 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004362{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004363 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004365 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004366 Py_ssize_t startinpos;
4367 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004368 const char *e, *aligned_end;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004369 PyObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004370 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004371 PyObject *errorHandler = NULL;
4372 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004373 Py_UCS4 maxchar = 0;
4374 Py_ssize_t unicode_size;
4375 Py_ssize_t i;
4376 int kind;
4377 void *data;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004378 int has_errors = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004379
Walter Dörwald69652032004-09-07 20:24:22 +00004380 if (size == 0) {
4381 if (consumed)
4382 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004383 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004384 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004385 maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004386 /* When the string is ASCII only, just use memcpy and return.
4387 unicode_size may be != size if there is an incomplete UTF-8
4388 sequence at the end of the ASCII block. */
4389 if (maxchar < 128 && size == unicode_size) {
Victor Stinner42885202011-11-22 01:23:02 +01004390 if (consumed)
4391 *consumed = size;
4392
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004393 if (size == 1)
4394 return get_latin1_char((unsigned char)s[0]);
4395
4396 unicode = PyUnicode_New(unicode_size, maxchar);
4397 if (!unicode)
4398 return NULL;
4399 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4400 assert(_PyUnicode_CheckConsistency(unicode, 1));
4401 return unicode;
4402 }
4403
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004404 /* In case of errors, maxchar and size computation might be incorrect;
4405 code below refits and resizes as necessary. */
4406 unicode = PyUnicode_New(unicode_size, maxchar);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004407 if (!unicode)
4408 return NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004409 kind = PyUnicode_KIND(unicode);
4410 data = PyUnicode_DATA(unicode);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004411
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004413 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414 e = s + size;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004415 switch (kind) {
4416 case PyUnicode_1BYTE_KIND:
4417 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4418 break;
4419 case PyUnicode_2BYTE_KIND:
4420 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4421 break;
4422 case PyUnicode_4BYTE_KIND:
4423 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4424 break;
4425 }
4426 if (!has_errors) {
4427 /* Ensure the unicode size calculation was correct */
4428 assert(i == unicode_size);
4429 assert(s == e);
4430 if (consumed)
4431 *consumed = s-starts;
4432 return unicode;
4433 }
4434 /* Fall through to the generic decoding loop for the rest of
4435 the string */
4436 if (refit_partial_string(&unicode, kind, data, i) < 0)
4437 goto onError;
4438
Antoine Pitrouab868312009-01-10 15:40:25 +00004439 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440
4441 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004442 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443
4444 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004445 /* Fast path for runs of ASCII characters. Given that common UTF-8
4446 input will consist of an overwhelming majority of ASCII
4447 characters, we try to optimize for this case by checking
4448 as many characters as a C 'long' can contain.
4449 First, check if we can do an aligned read, as most CPUs have
4450 a penalty for unaligned reads.
4451 */
4452 if (!((size_t) s & LONG_PTR_MASK)) {
4453 /* Help register allocation */
4454 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004455 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004456 while (_s < aligned_end) {
4457 /* Read a whole long at a time (either 4 or 8 bytes),
4458 and do a fast unrolled copy if it only contains ASCII
4459 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004460 unsigned long value = *(unsigned long *) _s;
4461 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004462 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004463 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4464 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4465 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4466 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004467#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004468 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4469 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4470 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4471 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004472#endif
4473 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004474 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004475 }
4476 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004477 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004478 if (s == e)
4479 break;
4480 ch = (unsigned char)*s;
4481 }
4482 }
4483
4484 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004485 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486 s++;
4487 continue;
4488 }
4489
4490 n = utf8_code_length[ch];
4491
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004492 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004493 if (consumed)
4494 break;
4495 else {
4496 errmsg = "unexpected end of data";
4497 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004498 endinpos = startinpos+1;
4499 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4500 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004501 goto utf8Error;
4502 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004503 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504
4505 switch (n) {
4506
4507 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004508 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 startinpos = s-starts;
4510 endinpos = startinpos+1;
4511 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004512
4513 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004514 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004515 startinpos = s-starts;
4516 endinpos = startinpos+1;
4517 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518
4519 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004520 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004521 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004522 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004523 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004524 goto utf8Error;
4525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004527 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004528 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004529 break;
4530
4531 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004532 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4533 will result in surrogates in range d800-dfff. Surrogates are
4534 not valid UTF-8 so they are rejected.
4535 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4536 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004537 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004538 (s[2] & 0xc0) != 0x80 ||
4539 ((unsigned char)s[0] == 0xE0 &&
4540 (unsigned char)s[1] < 0xA0) ||
4541 ((unsigned char)s[0] == 0xED &&
4542 (unsigned char)s[1] > 0x9F)) {
4543 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004544 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004545 endinpos = startinpos + 1;
4546
4547 /* if s[1] first two bits are 1 and 0, then the invalid
4548 continuation byte is s[2], so increment endinpos by 1,
4549 if not, s[1] is invalid and endinpos doesn't need to
4550 be incremented. */
4551 if ((s[1] & 0xC0) == 0x80)
4552 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004553 goto utf8Error;
4554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004556 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004557 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004558 break;
4559
4560 case 4:
4561 if ((s[1] & 0xc0) != 0x80 ||
4562 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004563 (s[3] & 0xc0) != 0x80 ||
4564 ((unsigned char)s[0] == 0xF0 &&
4565 (unsigned char)s[1] < 0x90) ||
4566 ((unsigned char)s[0] == 0xF4 &&
4567 (unsigned char)s[1] > 0x8F)) {
4568 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004569 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004570 endinpos = startinpos + 1;
4571 if ((s[1] & 0xC0) == 0x80) {
4572 endinpos++;
4573 if ((s[2] & 0xC0) == 0x80)
4574 endinpos++;
4575 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004576 goto utf8Error;
4577 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004578 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004579 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4580 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4581
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004582 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004583 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584 }
4585 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004586 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004587
Benjamin Peterson29060642009-01-31 22:14:21 +00004588 utf8Error:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004589 if (!has_errors) {
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004590 if (refit_partial_string(&unicode, kind, data, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004591 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004592 has_errors = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004593 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004594 if (unicode_decode_call_errorhandler(
4595 errors, &errorHandler,
4596 "utf8", errmsg,
4597 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004598 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004599 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004600 /* Update data because unicode_decode_call_errorhandler might have
4601 re-created or resized the unicode object. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004602 data = PyUnicode_DATA(unicode);
4603 kind = PyUnicode_KIND(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004604 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004606 /* Ensure the unicode_size calculation above was correct: */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004607 assert(has_errors || i == unicode_size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004608
Walter Dörwald69652032004-09-07 20:24:22 +00004609 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004612 /* Adjust length and ready string when it contained errors and
4613 is of the old resizable kind. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004614 if (has_errors) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01004615 if (PyUnicode_Resize(&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004616 goto onError;
4617 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004619 Py_XDECREF(errorHandler);
4620 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004621 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004622 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623
Benjamin Peterson29060642009-01-31 22:14:21 +00004624 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004625 Py_XDECREF(errorHandler);
4626 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627 Py_DECREF(unicode);
4628 return NULL;
4629}
4630
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004631#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004632
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004633#ifdef __APPLE__
4634
4635/* Simplified UTF-8 decoder using surrogateescape error handler,
4636 used to decode the command line arguments on Mac OS X. */
4637
4638wchar_t*
4639_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4640{
4641 int n;
4642 const char *e;
4643 wchar_t *unicode, *p;
4644
4645 /* Note: size will always be longer than the resulting Unicode
4646 character count */
4647 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4648 PyErr_NoMemory();
4649 return NULL;
4650 }
4651 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4652 if (!unicode)
4653 return NULL;
4654
4655 /* Unpack UTF-8 encoded data */
4656 p = unicode;
4657 e = s + size;
4658 while (s < e) {
4659 Py_UCS4 ch = (unsigned char)*s;
4660
4661 if (ch < 0x80) {
4662 *p++ = (wchar_t)ch;
4663 s++;
4664 continue;
4665 }
4666
4667 n = utf8_code_length[ch];
4668 if (s + n > e) {
4669 goto surrogateescape;
4670 }
4671
4672 switch (n) {
4673 case 0:
4674 case 1:
4675 goto surrogateescape;
4676
4677 case 2:
4678 if ((s[1] & 0xc0) != 0x80)
4679 goto surrogateescape;
4680 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4681 assert ((ch > 0x007F) && (ch <= 0x07FF));
4682 *p++ = (wchar_t)ch;
4683 break;
4684
4685 case 3:
4686 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4687 will result in surrogates in range d800-dfff. Surrogates are
4688 not valid UTF-8 so they are rejected.
4689 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4690 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4691 if ((s[1] & 0xc0) != 0x80 ||
4692 (s[2] & 0xc0) != 0x80 ||
4693 ((unsigned char)s[0] == 0xE0 &&
4694 (unsigned char)s[1] < 0xA0) ||
4695 ((unsigned char)s[0] == 0xED &&
4696 (unsigned char)s[1] > 0x9F)) {
4697
4698 goto surrogateescape;
4699 }
4700 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4701 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004702 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004703 break;
4704
4705 case 4:
4706 if ((s[1] & 0xc0) != 0x80 ||
4707 (s[2] & 0xc0) != 0x80 ||
4708 (s[3] & 0xc0) != 0x80 ||
4709 ((unsigned char)s[0] == 0xF0 &&
4710 (unsigned char)s[1] < 0x90) ||
4711 ((unsigned char)s[0] == 0xF4 &&
4712 (unsigned char)s[1] > 0x8F)) {
4713 goto surrogateescape;
4714 }
4715 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4716 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4717 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4718
4719#if SIZEOF_WCHAR_T == 4
4720 *p++ = (wchar_t)ch;
4721#else
4722 /* compute and append the two surrogates: */
4723
4724 /* translate from 10000..10FFFF to 0..FFFF */
4725 ch -= 0x10000;
4726
4727 /* high surrogate = top 10 bits added to D800 */
4728 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4729
4730 /* low surrogate = bottom 10 bits added to DC00 */
4731 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4732#endif
4733 break;
4734 }
4735 s += n;
4736 continue;
4737
4738 surrogateescape:
4739 *p++ = 0xDC00 + ch;
4740 s++;
4741 }
4742 *p = L'\0';
4743 return unicode;
4744}
4745
4746#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004748/* Primary internal function which creates utf8 encoded bytes objects.
4749
4750 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004751 and allocate exactly as much space needed at the end. Else allocate the
4752 maximum possible needed (4 result bytes per Unicode character), and return
4753 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004754*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004755PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004756_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757{
Tim Peters602f7402002-04-27 18:03:26 +00004758#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004759
Guido van Rossum98297ee2007-11-06 21:34:58 +00004760 Py_ssize_t i; /* index into s of next input byte */
4761 PyObject *result; /* result string object */
4762 char *p; /* next free byte in output buffer */
4763 Py_ssize_t nallocated; /* number of result bytes allocated */
4764 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004765 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004766 PyObject *errorHandler = NULL;
4767 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004768 int kind;
4769 void *data;
4770 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004771 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004773 if (!PyUnicode_Check(unicode)) {
4774 PyErr_BadArgument();
4775 return NULL;
4776 }
4777
4778 if (PyUnicode_READY(unicode) == -1)
4779 return NULL;
4780
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004781 if (PyUnicode_UTF8(unicode))
4782 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4783 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004784
4785 kind = PyUnicode_KIND(unicode);
4786 data = PyUnicode_DATA(unicode);
4787 size = PyUnicode_GET_LENGTH(unicode);
4788
Tim Peters602f7402002-04-27 18:03:26 +00004789 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790
Tim Peters602f7402002-04-27 18:03:26 +00004791 if (size <= MAX_SHORT_UNICHARS) {
4792 /* Write into the stack buffer; nallocated can't overflow.
4793 * At the end, we'll allocate exactly as much heap space as it
4794 * turns out we need.
4795 */
4796 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004797 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004798 p = stackbuf;
4799 }
4800 else {
4801 /* Overallocate on the heap, and give the excess back at the end. */
4802 nallocated = size * 4;
4803 if (nallocated / 4 != size) /* overflow! */
4804 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004805 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004806 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004807 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004808 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004809 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004810
Tim Peters602f7402002-04-27 18:03:26 +00004811 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004812 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004813
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004814 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004815 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004817
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004819 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004820 *p++ = (char)(0xc0 | (ch >> 6));
4821 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004822 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004823 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004824 Py_ssize_t repsize, k, startpos;
4825 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004826 rep = unicode_encode_call_errorhandler(
4827 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004828 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004829 if (!rep)
4830 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004832 if (PyBytes_Check(rep))
4833 repsize = PyBytes_GET_SIZE(rep);
4834 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004835 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004836
4837 if (repsize > 4) {
4838 Py_ssize_t offset;
4839
4840 if (result == NULL)
4841 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004842 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004843 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004844
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004845 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4846 /* integer overflow */
4847 PyErr_NoMemory();
4848 goto error;
4849 }
4850 nallocated += repsize - 4;
4851 if (result != NULL) {
4852 if (_PyBytes_Resize(&result, nallocated) < 0)
4853 goto error;
4854 } else {
4855 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004856 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004857 goto error;
4858 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4859 }
4860 p = PyBytes_AS_STRING(result) + offset;
4861 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004862
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004863 if (PyBytes_Check(rep)) {
4864 char *prep = PyBytes_AS_STRING(rep);
4865 for(k = repsize; k > 0; k--)
4866 *p++ = *prep++;
4867 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004868 enum PyUnicode_Kind repkind;
4869 void *repdata;
4870
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004871 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004872 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004873 repkind = PyUnicode_KIND(rep);
4874 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004875
4876 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004877 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004878 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004879 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004880 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004881 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004882 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004883 goto error;
4884 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004885 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004886 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004887 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004888 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004889 } else if (ch < 0x10000) {
4890 *p++ = (char)(0xe0 | (ch >> 12));
4891 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4892 *p++ = (char)(0x80 | (ch & 0x3f));
4893 } else /* ch >= 0x10000 */ {
Victor Stinner0d3721d2011-11-22 03:27:53 +01004894 assert(ch <= 0x10FFFF);
Tim Peters602f7402002-04-27 18:03:26 +00004895 /* Encode UCS4 Unicode ordinals */
4896 *p++ = (char)(0xf0 | (ch >> 18));
4897 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4898 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4899 *p++ = (char)(0x80 | (ch & 0x3f));
4900 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004902
Guido van Rossum98297ee2007-11-06 21:34:58 +00004903 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004904 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004905 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004906 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004907 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004908 }
4909 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004910 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004911 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004912 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004913 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004914 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004915
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004916 Py_XDECREF(errorHandler);
4917 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004918 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004919 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004920 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004921 Py_XDECREF(errorHandler);
4922 Py_XDECREF(exc);
4923 Py_XDECREF(result);
4924 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004925
Tim Peters602f7402002-04-27 18:03:26 +00004926#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927}
4928
Alexander Belopolsky40018472011-02-26 01:02:56 +00004929PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004930PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4931 Py_ssize_t size,
4932 const char *errors)
4933{
4934 PyObject *v, *unicode;
4935
4936 unicode = PyUnicode_FromUnicode(s, size);
4937 if (unicode == NULL)
4938 return NULL;
4939 v = _PyUnicode_AsUTF8String(unicode, errors);
4940 Py_DECREF(unicode);
4941 return v;
4942}
4943
4944PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004945PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004946{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004947 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004948}
4949
Walter Dörwald41980ca2007-08-16 21:55:45 +00004950/* --- UTF-32 Codec ------------------------------------------------------- */
4951
4952PyObject *
4953PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004954 Py_ssize_t size,
4955 const char *errors,
4956 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004957{
4958 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4959}
4960
4961PyObject *
4962PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 Py_ssize_t size,
4964 const char *errors,
4965 int *byteorder,
4966 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004967{
4968 const char *starts = s;
4969 Py_ssize_t startinpos;
4970 Py_ssize_t endinpos;
4971 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004972 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004973 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004974 int bo = 0; /* assume native ordering by default */
4975 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004976 /* Offsets from q for retrieving bytes in the right order. */
4977#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4978 int iorder[] = {0, 1, 2, 3};
4979#else
4980 int iorder[] = {3, 2, 1, 0};
4981#endif
4982 PyObject *errorHandler = NULL;
4983 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004984
Walter Dörwald41980ca2007-08-16 21:55:45 +00004985 q = (unsigned char *)s;
4986 e = q + size;
4987
4988 if (byteorder)
4989 bo = *byteorder;
4990
4991 /* Check for BOM marks (U+FEFF) in the input and adjust current
4992 byte order setting accordingly. In native mode, the leading BOM
4993 mark is skipped, in all other modes, it is copied to the output
4994 stream as-is (giving a ZWNBSP character). */
4995 if (bo == 0) {
4996 if (size >= 4) {
4997 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004998 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004999#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005000 if (bom == 0x0000FEFF) {
5001 q += 4;
5002 bo = -1;
5003 }
5004 else if (bom == 0xFFFE0000) {
5005 q += 4;
5006 bo = 1;
5007 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005008#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005009 if (bom == 0x0000FEFF) {
5010 q += 4;
5011 bo = 1;
5012 }
5013 else if (bom == 0xFFFE0000) {
5014 q += 4;
5015 bo = -1;
5016 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005017#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005018 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005019 }
5020
5021 if (bo == -1) {
5022 /* force LE */
5023 iorder[0] = 0;
5024 iorder[1] = 1;
5025 iorder[2] = 2;
5026 iorder[3] = 3;
5027 }
5028 else if (bo == 1) {
5029 /* force BE */
5030 iorder[0] = 3;
5031 iorder[1] = 2;
5032 iorder[2] = 1;
5033 iorder[3] = 0;
5034 }
5035
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005036 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005037 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005038 if (!unicode)
5039 return NULL;
5040 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005041 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005042 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005043
Walter Dörwald41980ca2007-08-16 21:55:45 +00005044 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005045 Py_UCS4 ch;
5046 /* remaining bytes at the end? (size should be divisible by 4) */
5047 if (e-q<4) {
5048 if (consumed)
5049 break;
5050 errmsg = "truncated data";
5051 startinpos = ((const char *)q)-starts;
5052 endinpos = ((const char *)e)-starts;
5053 goto utf32Error;
5054 /* The remaining input chars are ignored if the callback
5055 chooses to skip the input */
5056 }
5057 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5058 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005059
Benjamin Peterson29060642009-01-31 22:14:21 +00005060 if (ch >= 0x110000)
5061 {
5062 errmsg = "codepoint not in range(0x110000)";
5063 startinpos = ((const char *)q)-starts;
5064 endinpos = startinpos+4;
5065 goto utf32Error;
5066 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005067 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5068 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005069 q += 4;
5070 continue;
5071 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 if (unicode_decode_call_errorhandler(
5073 errors, &errorHandler,
5074 "utf32", errmsg,
5075 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005076 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005077 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005078 }
5079
5080 if (byteorder)
5081 *byteorder = bo;
5082
5083 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005084 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005085
5086 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005087 if (PyUnicode_Resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005088 goto onError;
5089
5090 Py_XDECREF(errorHandler);
5091 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005092 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005093
Benjamin Peterson29060642009-01-31 22:14:21 +00005094 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005095 Py_DECREF(unicode);
5096 Py_XDECREF(errorHandler);
5097 Py_XDECREF(exc);
5098 return NULL;
5099}
5100
5101PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005102_PyUnicode_EncodeUTF32(PyObject *str,
5103 const char *errors,
5104 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005105{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005106 int kind;
5107 void *data;
5108 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005109 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005110 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005111 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005112 /* Offsets from p for storing byte pairs in the right order. */
5113#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5114 int iorder[] = {0, 1, 2, 3};
5115#else
5116 int iorder[] = {3, 2, 1, 0};
5117#endif
5118
Benjamin Peterson29060642009-01-31 22:14:21 +00005119#define STORECHAR(CH) \
5120 do { \
5121 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5122 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5123 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5124 p[iorder[0]] = (CH) & 0xff; \
5125 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005126 } while(0)
5127
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005128 if (!PyUnicode_Check(str)) {
5129 PyErr_BadArgument();
5130 return NULL;
5131 }
5132 if (PyUnicode_READY(str) < 0)
5133 return NULL;
5134 kind = PyUnicode_KIND(str);
5135 data = PyUnicode_DATA(str);
5136 len = PyUnicode_GET_LENGTH(str);
5137
5138 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005139 bytesize = nsize * 4;
5140 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005141 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005142 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005143 if (v == NULL)
5144 return NULL;
5145
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005146 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005147 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005148 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005149 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005150 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005151
5152 if (byteorder == -1) {
5153 /* force LE */
5154 iorder[0] = 0;
5155 iorder[1] = 1;
5156 iorder[2] = 2;
5157 iorder[3] = 3;
5158 }
5159 else if (byteorder == 1) {
5160 /* force BE */
5161 iorder[0] = 3;
5162 iorder[1] = 2;
5163 iorder[2] = 1;
5164 iorder[3] = 0;
5165 }
5166
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005167 for (i = 0; i < len; i++)
5168 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005169
5170 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005171 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005172#undef STORECHAR
5173}
5174
Alexander Belopolsky40018472011-02-26 01:02:56 +00005175PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005176PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5177 Py_ssize_t size,
5178 const char *errors,
5179 int byteorder)
5180{
5181 PyObject *result;
5182 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5183 if (tmp == NULL)
5184 return NULL;
5185 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5186 Py_DECREF(tmp);
5187 return result;
5188}
5189
5190PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005191PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005192{
Victor Stinnerb960b342011-11-20 19:12:52 +01005193 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005194}
5195
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196/* --- UTF-16 Codec ------------------------------------------------------- */
5197
Tim Peters772747b2001-08-09 22:21:55 +00005198PyObject *
5199PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005200 Py_ssize_t size,
5201 const char *errors,
5202 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203{
Walter Dörwald69652032004-09-07 20:24:22 +00005204 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5205}
5206
Antoine Pitrouab868312009-01-10 15:40:25 +00005207/* Two masks for fast checking of whether a C 'long' may contain
5208 UTF16-encoded surrogate characters. This is an efficient heuristic,
5209 assuming that non-surrogate characters with a code point >= 0x8000 are
5210 rare in most input.
5211 FAST_CHAR_MASK is used when the input is in native byte ordering,
5212 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005213*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005214#if (SIZEOF_LONG == 8)
5215# define FAST_CHAR_MASK 0x8000800080008000L
5216# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5217#elif (SIZEOF_LONG == 4)
5218# define FAST_CHAR_MASK 0x80008000L
5219# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5220#else
5221# error C 'long' size should be either 4 or 8!
5222#endif
5223
Walter Dörwald69652032004-09-07 20:24:22 +00005224PyObject *
5225PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005226 Py_ssize_t size,
5227 const char *errors,
5228 int *byteorder,
5229 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005230{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005231 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005232 Py_ssize_t startinpos;
5233 Py_ssize_t endinpos;
5234 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005235 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005236 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005237 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005238 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005239 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005240 /* Offsets from q for retrieving byte pairs in the right order. */
5241#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5242 int ihi = 1, ilo = 0;
5243#else
5244 int ihi = 0, ilo = 1;
5245#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005246 PyObject *errorHandler = NULL;
5247 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248
5249 /* Note: size will always be longer than the resulting Unicode
5250 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005251 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252 if (!unicode)
5253 return NULL;
5254 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005255 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005256 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257
Tim Peters772747b2001-08-09 22:21:55 +00005258 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005259 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260
5261 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005262 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005264 /* Check for BOM marks (U+FEFF) in the input and adjust current
5265 byte order setting accordingly. In native mode, the leading BOM
5266 mark is skipped, in all other modes, it is copied to the output
5267 stream as-is (giving a ZWNBSP character). */
5268 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005269 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005270 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005271#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005272 if (bom == 0xFEFF) {
5273 q += 2;
5274 bo = -1;
5275 }
5276 else if (bom == 0xFFFE) {
5277 q += 2;
5278 bo = 1;
5279 }
Tim Petersced69f82003-09-16 20:30:58 +00005280#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005281 if (bom == 0xFEFF) {
5282 q += 2;
5283 bo = 1;
5284 }
5285 else if (bom == 0xFFFE) {
5286 q += 2;
5287 bo = -1;
5288 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005289#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005290 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005291 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292
Tim Peters772747b2001-08-09 22:21:55 +00005293 if (bo == -1) {
5294 /* force LE */
5295 ihi = 1;
5296 ilo = 0;
5297 }
5298 else if (bo == 1) {
5299 /* force BE */
5300 ihi = 0;
5301 ilo = 1;
5302 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005303#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5304 native_ordering = ilo < ihi;
5305#else
5306 native_ordering = ilo > ihi;
5307#endif
Tim Peters772747b2001-08-09 22:21:55 +00005308
Antoine Pitrouab868312009-01-10 15:40:25 +00005309 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005310 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005311 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005312 /* First check for possible aligned read of a C 'long'. Unaligned
5313 reads are more expensive, better to defer to another iteration. */
5314 if (!((size_t) q & LONG_PTR_MASK)) {
5315 /* Fast path for runs of non-surrogate chars. */
5316 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005317 int kind = PyUnicode_KIND(unicode);
5318 void *data = PyUnicode_DATA(unicode);
5319 while (_q < aligned_end) {
5320 unsigned long block = * (unsigned long *) _q;
5321 unsigned short *pblock = (unsigned short*)&block;
5322 Py_UCS4 maxch;
5323 if (native_ordering) {
5324 /* Can use buffer directly */
5325 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005326 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005327 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005328 else {
5329 /* Need to byte-swap */
5330 unsigned char *_p = (unsigned char*)pblock;
5331 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005332 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005333 _p[0] = _q[1];
5334 _p[1] = _q[0];
5335 _p[2] = _q[3];
5336 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005337#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005338 _p[4] = _q[5];
5339 _p[5] = _q[4];
5340 _p[6] = _q[7];
5341 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005342#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005343 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005344 maxch = Py_MAX(pblock[0], pblock[1]);
5345#if SIZEOF_LONG == 8
5346 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5347#endif
5348 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5349 if (unicode_widen(&unicode, maxch) < 0)
5350 goto onError;
5351 kind = PyUnicode_KIND(unicode);
5352 data = PyUnicode_DATA(unicode);
5353 }
5354 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5355 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5356#if SIZEOF_LONG == 8
5357 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5358 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5359#endif
5360 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005361 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005362 q = _q;
5363 if (q >= e)
5364 break;
5365 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005367
Benjamin Peterson14339b62009-01-31 16:36:08 +00005368 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005369
5370 if (ch < 0xD800 || ch > 0xDFFF) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005371 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5372 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 continue;
5374 }
5375
5376 /* UTF-16 code pair: */
5377 if (q > e) {
5378 errmsg = "unexpected end of data";
5379 startinpos = (((const char *)q) - 2) - starts;
5380 endinpos = ((const char *)e) + 1 - starts;
5381 goto utf16Error;
5382 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005383 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5384 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005385 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005386 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005387 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005388 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005389 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005390 continue;
5391 }
5392 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005393 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005394 startinpos = (((const char *)q)-4)-starts;
5395 endinpos = startinpos+2;
5396 goto utf16Error;
5397 }
5398
Benjamin Peterson14339b62009-01-31 16:36:08 +00005399 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005400 errmsg = "illegal encoding";
5401 startinpos = (((const char *)q)-2)-starts;
5402 endinpos = startinpos+2;
5403 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005404
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005406 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005407 errors,
5408 &errorHandler,
5409 "utf16", errmsg,
5410 &starts,
5411 (const char **)&e,
5412 &startinpos,
5413 &endinpos,
5414 &exc,
5415 (const char **)&q,
5416 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005417 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005420 /* remaining byte at the end? (size should be even) */
5421 if (e == q) {
5422 if (!consumed) {
5423 errmsg = "truncated data";
5424 startinpos = ((const char *)q) - starts;
5425 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005426 if (unicode_decode_call_errorhandler(
5427 errors,
5428 &errorHandler,
5429 "utf16", errmsg,
5430 &starts,
5431 (const char **)&e,
5432 &startinpos,
5433 &endinpos,
5434 &exc,
5435 (const char **)&q,
5436 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005437 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005438 goto onError;
5439 /* The remaining input chars are ignored if the callback
5440 chooses to skip the input */
5441 }
5442 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443
5444 if (byteorder)
5445 *byteorder = bo;
5446
Walter Dörwald69652032004-09-07 20:24:22 +00005447 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005448 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005449
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005451 if (PyUnicode_Resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 goto onError;
5453
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005454 Py_XDECREF(errorHandler);
5455 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005456 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457
Benjamin Peterson29060642009-01-31 22:14:21 +00005458 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005460 Py_XDECREF(errorHandler);
5461 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 return NULL;
5463}
5464
Antoine Pitrouab868312009-01-10 15:40:25 +00005465#undef FAST_CHAR_MASK
5466#undef SWAPPED_FAST_CHAR_MASK
5467
Tim Peters772747b2001-08-09 22:21:55 +00005468PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005469_PyUnicode_EncodeUTF16(PyObject *str,
5470 const char *errors,
5471 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005473 int kind;
5474 void *data;
5475 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005476 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005477 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005478 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005479 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005480 /* Offsets from p for storing byte pairs in the right order. */
5481#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5482 int ihi = 1, ilo = 0;
5483#else
5484 int ihi = 0, ilo = 1;
5485#endif
5486
Benjamin Peterson29060642009-01-31 22:14:21 +00005487#define STORECHAR(CH) \
5488 do { \
5489 p[ihi] = ((CH) >> 8) & 0xff; \
5490 p[ilo] = (CH) & 0xff; \
5491 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005492 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005494 if (!PyUnicode_Check(str)) {
5495 PyErr_BadArgument();
5496 return NULL;
5497 }
5498 if (PyUnicode_READY(str) < 0)
5499 return NULL;
5500 kind = PyUnicode_KIND(str);
5501 data = PyUnicode_DATA(str);
5502 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005503
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005504 pairs = 0;
5505 if (kind == PyUnicode_4BYTE_KIND)
5506 for (i = 0; i < len; i++)
5507 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5508 pairs++;
5509 /* 2 * (len + pairs + (byteorder == 0)) */
5510 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005512 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005513 bytesize = nsize * 2;
5514 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005515 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005516 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 if (v == NULL)
5518 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005520 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005522 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005523 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005524 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005525
5526 if (byteorder == -1) {
5527 /* force LE */
5528 ihi = 1;
5529 ilo = 0;
5530 }
5531 else if (byteorder == 1) {
5532 /* force BE */
5533 ihi = 0;
5534 ilo = 1;
5535 }
5536
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005537 for (i = 0; i < len; i++) {
5538 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5539 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005540 if (ch >= 0x10000) {
5541 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5542 ch = 0xD800 | ((ch-0x10000) >> 10);
5543 }
Tim Peters772747b2001-08-09 22:21:55 +00005544 STORECHAR(ch);
5545 if (ch2)
5546 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005547 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005548
5549 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005550 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005551#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552}
5553
Alexander Belopolsky40018472011-02-26 01:02:56 +00005554PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005555PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5556 Py_ssize_t size,
5557 const char *errors,
5558 int byteorder)
5559{
5560 PyObject *result;
5561 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5562 if (tmp == NULL)
5563 return NULL;
5564 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5565 Py_DECREF(tmp);
5566 return result;
5567}
5568
5569PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005570PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005572 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573}
5574
5575/* --- Unicode Escape Codec ----------------------------------------------- */
5576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005577/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5578 if all the escapes in the string make it still a valid ASCII string.
5579 Returns -1 if any escapes were found which cause the string to
5580 pop out of ASCII range. Otherwise returns the length of the
5581 required buffer to hold the string.
5582 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005583static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005584length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5585{
5586 const unsigned char *p = (const unsigned char *)s;
5587 const unsigned char *end = p + size;
5588 Py_ssize_t length = 0;
5589
5590 if (size < 0)
5591 return -1;
5592
5593 for (; p < end; ++p) {
5594 if (*p > 127) {
5595 /* Non-ASCII */
5596 return -1;
5597 }
5598 else if (*p != '\\') {
5599 /* Normal character */
5600 ++length;
5601 }
5602 else {
5603 /* Backslash-escape, check next char */
5604 ++p;
5605 /* Escape sequence reaches till end of string or
5606 non-ASCII follow-up. */
5607 if (p >= end || *p > 127)
5608 return -1;
5609 switch (*p) {
5610 case '\n':
5611 /* backslash + \n result in zero characters */
5612 break;
5613 case '\\': case '\'': case '\"':
5614 case 'b': case 'f': case 't':
5615 case 'n': case 'r': case 'v': case 'a':
5616 ++length;
5617 break;
5618 case '0': case '1': case '2': case '3':
5619 case '4': case '5': case '6': case '7':
5620 case 'x': case 'u': case 'U': case 'N':
5621 /* these do not guarantee ASCII characters */
5622 return -1;
5623 default:
5624 /* count the backslash + the other character */
5625 length += 2;
5626 }
5627 }
5628 }
5629 return length;
5630}
5631
/* File-scope pointer to a _PyUnicode_Name_CAPI table; it starts out NULL.
   NOTE(review): presumably filled in on demand by the \N{...} escape
   handling later in this file -- confirm against the code that assigns it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005632static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005633
Alexander Belopolsky40018472011-02-26 01:02:56 +00005634PyObject *
5635PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005636 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005637 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005639 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005640 Py_ssize_t startinpos;
5641 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005642 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005643 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005645 char* message;
5646 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005647 PyObject *errorHandler = NULL;
5648 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005649 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005650 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005651
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005652 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005653
5654 /* After length_of_escaped_ascii_string() there are two alternatives,
5655 either the string is pure ASCII with named escapes like \n, etc.
5656 and we determined it's exact size (common case)
5657 or it contains \x, \u, ... escape sequences. then we create a
5658 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005659 if (len >= 0) {
5660 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005661 if (!v)
5662 goto onError;
5663 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005664 }
5665 else {
5666 /* Escaped strings will always be longer than the resulting
5667 Unicode string, so we start with size here and then reduce the
5668 length after conversion to the true value.
5669 (but if the error callback returns a long replacement string
5670 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005671 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005672 if (!v)
5673 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005674 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005675 }
5676
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005678 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005679 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005681
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 while (s < end) {
5683 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005684 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005685 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005687 /* The only case in which i == ascii_length is a backslash
5688 followed by a newline. */
5689 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005690
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 /* Non-escape characters are interpreted as Unicode ordinals */
5692 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005693 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5694 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 continue;
5696 }
5697
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005698 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 /* \ - Escapes */
5700 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005701 c = *s++;
5702 if (s > end)
5703 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005704
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005705 /* The only case in which i == ascii_length is a backslash
5706 followed by a newline. */
5707 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005708
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005709 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005712#define WRITECHAR(ch) \
5713 do { \
5714 if (unicode_putchar(&v, &i, ch) < 0) \
5715 goto onError; \
5716 }while(0)
5717
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005719 case '\\': WRITECHAR('\\'); break;
5720 case '\'': WRITECHAR('\''); break;
5721 case '\"': WRITECHAR('\"'); break;
5722 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005723 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005724 case 'f': WRITECHAR('\014'); break;
5725 case 't': WRITECHAR('\t'); break;
5726 case 'n': WRITECHAR('\n'); break;
5727 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005728 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005729 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005730 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005731 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 case '0': case '1': case '2': case '3':
5735 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005736 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005737 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005738 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005739 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005740 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005742 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 break;
5744
Benjamin Peterson29060642009-01-31 22:14:21 +00005745 /* hex escapes */
5746 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005748 digits = 2;
5749 message = "truncated \\xXX escape";
5750 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751
Benjamin Peterson29060642009-01-31 22:14:21 +00005752 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005754 digits = 4;
5755 message = "truncated \\uXXXX escape";
5756 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757
Benjamin Peterson29060642009-01-31 22:14:21 +00005758 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005759 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005760 digits = 8;
5761 message = "truncated \\UXXXXXXXX escape";
5762 hexescape:
5763 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005764 if (s+digits>end) {
5765 endinpos = size;
5766 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005767 errors, &errorHandler,
5768 "unicodeescape", "end of string in escape sequence",
5769 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005770 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 goto onError;
5772 goto nextByte;
5773 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005774 for (j = 0; j < digits; ++j) {
5775 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005776 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005777 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005778 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 errors, &errorHandler,
5780 "unicodeescape", message,
5781 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005782 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005783 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005784 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005785 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005786 }
5787 chr = (chr<<4) & ~0xF;
5788 if (c >= '0' && c <= '9')
5789 chr += c - '0';
5790 else if (c >= 'a' && c <= 'f')
5791 chr += 10 + c - 'a';
5792 else
5793 chr += 10 + c - 'A';
5794 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005795 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005796 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005797 /* _decoding_error will have already written into the
5798 target buffer. */
5799 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005800 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005801 /* when we get here, chr is a 32-bit unicode character */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005802 if (chr <= 0x10ffff) {
5803 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005804 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005805 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005806 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005807 errors, &errorHandler,
5808 "unicodeescape", "illegal Unicode character",
5809 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005810 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005811 goto onError;
5812 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005813 break;
5814
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005816 case 'N':
5817 message = "malformed \\N character escape";
5818 if (ucnhash_CAPI == NULL) {
5819 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005820 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5821 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005822 if (ucnhash_CAPI == NULL)
5823 goto ucnhashError;
5824 }
5825 if (*s == '{') {
5826 const char *start = s+1;
5827 /* look for the closing brace */
5828 while (*s != '}' && s < end)
5829 s++;
5830 if (s > start && s < end && *s == '}') {
5831 /* found a name. look it up in the unicode database */
5832 message = "unknown Unicode character name";
5833 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005834 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005835 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005836 goto store;
5837 }
5838 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005839 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005840 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 errors, &errorHandler,
5842 "unicodeescape", message,
5843 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005844 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005845 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005846 break;
5847
5848 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005849 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005850 message = "\\ at end of string";
5851 s--;
5852 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005853 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005854 errors, &errorHandler,
5855 "unicodeescape", message,
5856 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005857 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005858 goto onError;
5859 }
5860 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005861 WRITECHAR('\\');
5862 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005863 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005864 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005866 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005867 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005869#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005870
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005871 if (PyUnicode_Resize(&v, i) < 0)
5872 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005873 Py_XDECREF(errorHandler);
5874 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005875 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005876
Benjamin Peterson29060642009-01-31 22:14:21 +00005877 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005878 PyErr_SetString(
5879 PyExc_UnicodeError,
5880 "\\N escapes not supported (can't load unicodedata module)"
5881 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005882 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005883 Py_XDECREF(errorHandler);
5884 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005885 return NULL;
5886
Benjamin Peterson29060642009-01-31 22:14:21 +00005887 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005889 Py_XDECREF(errorHandler);
5890 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 return NULL;
5892}
5893
/* Return a Unicode-Escape encoded bytes object for the Unicode object.

   The result holds only the escaped text; no surrounding quotes are
   added.

*/
5900
Alexander Belopolsky40018472011-02-26 01:02:56 +00005901PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005902PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005904 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005905 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005907 int kind;
5908 void *data;
5909 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910
Thomas Wouters89f507f2006-12-13 04:49:30 +00005911 /* Initial allocation is based on the longest-possible unichr
5912 escape.
5913
5914 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5915 unichr, so in this case it's the longest unichr escape. In
5916 narrow (UTF-16) builds this is five chars per source unichr
5917 since there are two unichrs in the surrogate pair, so in narrow
5918 (UTF-16) builds it's not the longest unichr escape.
5919
5920 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5921 so in the narrow (UTF-16) build case it's the longest unichr
5922 escape.
5923 */
5924
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005925 if (!PyUnicode_Check(unicode)) {
5926 PyErr_BadArgument();
5927 return NULL;
5928 }
5929 if (PyUnicode_READY(unicode) < 0)
5930 return NULL;
5931 len = PyUnicode_GET_LENGTH(unicode);
5932 kind = PyUnicode_KIND(unicode);
5933 data = PyUnicode_DATA(unicode);
5934 switch(kind) {
5935 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5936 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5937 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5938 }
5939
5940 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005941 return PyBytes_FromStringAndSize(NULL, 0);
5942
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005943 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005944 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005945
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005946 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005947 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005948 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005949 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 if (repr == NULL)
5951 return NULL;
5952
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005953 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005955 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005956 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005957
Walter Dörwald79e913e2007-05-12 11:08:06 +00005958 /* Escape backslashes */
5959 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 *p++ = '\\';
5961 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005962 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005963 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005964
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005965 /* Map 21-bit characters to '\U00xxxxxx' */
5966 else if (ch >= 0x10000) {
Victor Stinner0d3721d2011-11-22 03:27:53 +01005967 assert(ch <= 0x10FFFF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005968 *p++ = '\\';
5969 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005970 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5971 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5972 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5973 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5974 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5975 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5976 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5977 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005978 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005979 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005980
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005982 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 *p++ = '\\';
5984 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005985 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5986 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5987 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5988 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005990
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005991 /* Map special whitespace to '\t', \n', '\r' */
5992 else if (ch == '\t') {
5993 *p++ = '\\';
5994 *p++ = 't';
5995 }
5996 else if (ch == '\n') {
5997 *p++ = '\\';
5998 *p++ = 'n';
5999 }
6000 else if (ch == '\r') {
6001 *p++ = '\\';
6002 *p++ = 'r';
6003 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006004
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006005 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006006 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006008 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006009 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6010 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006011 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006012
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 /* Copy everything else as-is */
6014 else
6015 *p++ = (char) ch;
6016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006018 assert(p - PyBytes_AS_STRING(repr) > 0);
6019 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6020 return NULL;
6021 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022}
6023
Alexander Belopolsky40018472011-02-26 01:02:56 +00006024PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006025PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6026 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006028 PyObject *result;
6029 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6030 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006032 result = PyUnicode_AsUnicodeEscapeString(tmp);
6033 Py_DECREF(tmp);
6034 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035}
6036
6037/* --- Raw Unicode Escape Codec ------------------------------------------- */
6038
Alexander Belopolsky40018472011-02-26 01:02:56 +00006039PyObject *
6040PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006041 Py_ssize_t size,
6042 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006044 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006045 Py_ssize_t startinpos;
6046 Py_ssize_t endinpos;
6047 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006048 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049 const char *end;
6050 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006051 PyObject *errorHandler = NULL;
6052 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006053
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 /* Escaped strings will always be longer than the resulting
6055 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006056 length after conversion to the true value. (But decoding error
6057 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006058 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006062 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006063 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 end = s + size;
6065 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 unsigned char c;
6067 Py_UCS4 x;
6068 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006069 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 /* Non-escape characters are interpreted as Unicode ordinals */
6072 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006073 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6074 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006076 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 startinpos = s-starts;
6078
6079 /* \u-escapes are only interpreted iff the number of leading
6080 backslashes if odd */
6081 bs = s;
6082 for (;s < end;) {
6083 if (*s != '\\')
6084 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006085 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6086 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006087 }
6088 if (((s - bs) & 1) == 0 ||
6089 s >= end ||
6090 (*s != 'u' && *s != 'U')) {
6091 continue;
6092 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006093 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 count = *s=='u' ? 4 : 8;
6095 s++;
6096
6097 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006098 for (x = 0, i = 0; i < count; ++i, ++s) {
6099 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006100 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 endinpos = s-starts;
6102 if (unicode_decode_call_errorhandler(
6103 errors, &errorHandler,
6104 "rawunicodeescape", "truncated \\uXXXX",
6105 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006106 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 goto onError;
6108 goto nextByte;
6109 }
6110 x = (x<<4) & ~0xF;
6111 if (c >= '0' && c <= '9')
6112 x += c - '0';
6113 else if (c >= 'a' && c <= 'f')
6114 x += 10 + c - 'a';
6115 else
6116 x += 10 + c - 'A';
6117 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006118 if (x <= 0x10ffff) {
6119 if (unicode_putchar(&v, &outpos, x) < 0)
6120 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006121 } else {
6122 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006123 if (unicode_decode_call_errorhandler(
6124 errors, &errorHandler,
6125 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006126 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006127 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006129 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 nextByte:
6131 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006133 if (PyUnicode_Resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006135 Py_XDECREF(errorHandler);
6136 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006137 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006138
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006141 Py_XDECREF(errorHandler);
6142 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 return NULL;
6144}
6145
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006146
Alexander Belopolsky40018472011-02-26 01:02:56 +00006147PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006148PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006150 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 char *p;
6152 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006153 Py_ssize_t expandsize, pos;
6154 int kind;
6155 void *data;
6156 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006158 if (!PyUnicode_Check(unicode)) {
6159 PyErr_BadArgument();
6160 return NULL;
6161 }
6162 if (PyUnicode_READY(unicode) < 0)
6163 return NULL;
6164 kind = PyUnicode_KIND(unicode);
6165 data = PyUnicode_DATA(unicode);
6166 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson02686752011-11-22 15:29:32 -05006167 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006168
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006169 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006170 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006171
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006172 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173 if (repr == NULL)
6174 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006175 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006176 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006178 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006179 for (pos = 0; pos < len; pos++) {
6180 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006181 /* Map 32-bit characters to '\Uxxxxxxxx' */
6182 if (ch >= 0x10000) {
Victor Stinner0d3721d2011-11-22 03:27:53 +01006183 assert(ch <= 0x10FFFF);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006184 *p++ = '\\';
6185 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006186 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6187 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6188 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6189 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6190 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6191 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6192 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6193 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006194 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006195 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006196 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197 *p++ = '\\';
6198 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006199 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6200 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6201 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6202 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 /* Copy everything else as-is */
6205 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206 *p++ = (char) ch;
6207 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006208
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006209 assert(p > q);
6210 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006211 return NULL;
6212 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213}
6214
Alexander Belopolsky40018472011-02-26 01:02:56 +00006215PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006216PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6217 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006219 PyObject *result;
6220 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6221 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006222 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006223 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6224 Py_DECREF(tmp);
6225 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226}
6227
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006228/* --- Unicode Internal Codec ------------------------------------------- */
6229
Alexander Belopolsky40018472011-02-26 01:02:56 +00006230PyObject *
6231_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006232 Py_ssize_t size,
6233 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006234{
6235 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006236 Py_ssize_t startinpos;
6237 Py_ssize_t endinpos;
6238 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006239 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006240 const char *end;
6241 const char *reason;
6242 PyObject *errorHandler = NULL;
6243 PyObject *exc = NULL;
6244
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006245 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006246 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006247 1))
6248 return NULL;
6249
Thomas Wouters89f507f2006-12-13 04:49:30 +00006250 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006251 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006252 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006254 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006255 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006256 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006257 end = s + size;
6258
6259 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006260 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006261 Py_UCS4 ch;
6262 /* We copy the raw representation one byte at a time because the
6263 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006264 ((char *) &uch)[0] = s[0];
6265 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006266#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006267 ((char *) &uch)[2] = s[2];
6268 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006269#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006270 ch = uch;
6271
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006272 /* We have to sanity check the raw data, otherwise doom looms for
6273 some malformed UCS-4 data. */
6274 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006275#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006276 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006277#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006278 end-s < Py_UNICODE_SIZE
6279 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006280 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006281 startinpos = s - starts;
6282 if (end-s < Py_UNICODE_SIZE) {
6283 endinpos = end-starts;
6284 reason = "truncated input";
6285 }
6286 else {
6287 endinpos = s - starts + Py_UNICODE_SIZE;
6288 reason = "illegal code point (> 0x10FFFF)";
6289 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006290 if (unicode_decode_call_errorhandler(
6291 errors, &errorHandler,
6292 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006293 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006294 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006295 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006296 continue;
6297 }
6298
6299 s += Py_UNICODE_SIZE;
6300#ifndef Py_UNICODE_WIDE
6301 if (ch >= 0xD800 && ch <= 0xDBFF && s < end)
6302 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006303 Py_UNICODE uch2;
6304 ((char *) &uch2)[0] = s[0];
6305 ((char *) &uch2)[1] = s[1];
6306 if (uch2 >= 0xDC00 && uch2 <= 0xDFFF)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006307 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006308 ch = (((uch & 0x3FF)<<10) | (uch2 & 0x3FF)) + 0x10000;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006309 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006310 }
6311 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006312#endif
6313
6314 if (unicode_putchar(&v, &outpos, ch) < 0)
6315 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006316 }
6317
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006318 if (PyUnicode_Resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006319 goto onError;
6320 Py_XDECREF(errorHandler);
6321 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006322 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006323
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006325 Py_XDECREF(v);
6326 Py_XDECREF(errorHandler);
6327 Py_XDECREF(exc);
6328 return NULL;
6329}
6330
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331/* --- Latin-1 Codec ------------------------------------------------------ */
6332
Alexander Belopolsky40018472011-02-26 01:02:56 +00006333PyObject *
6334PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006335 Py_ssize_t size,
6336 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006339 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340}
6341
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006342/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006343static void
6344make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006345 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006346 PyObject *unicode,
6347 Py_ssize_t startpos, Py_ssize_t endpos,
6348 const char *reason)
6349{
6350 if (*exceptionObject == NULL) {
6351 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006352 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006353 encoding, unicode, startpos, endpos, reason);
6354 }
6355 else {
6356 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6357 goto onError;
6358 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6359 goto onError;
6360 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6361 goto onError;
6362 return;
6363 onError:
6364 Py_DECREF(*exceptionObject);
6365 *exceptionObject = NULL;
6366 }
6367}
6368
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006369/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006370static void
6371raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006372 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006373 PyObject *unicode,
6374 Py_ssize_t startpos, Py_ssize_t endpos,
6375 const char *reason)
6376{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006377 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006378 encoding, unicode, startpos, endpos, reason);
6379 if (*exceptionObject != NULL)
6380 PyCodec_StrictErrors(*exceptionObject);
6381}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006382
6383/* error handling callback helper:
6384 build arguments, call the callback and check the arguments,
6385 put the result into newpos and return the replacement string, which
6386 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006387static PyObject *
6388unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006389 PyObject **errorHandler,
6390 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006391 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006392 Py_ssize_t startpos, Py_ssize_t endpos,
6393 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006394{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006395 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006396 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006397 PyObject *restuple;
6398 PyObject *resunicode;
6399
6400 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006401 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006402 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006403 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006404 }
6405
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006406 if (PyUnicode_READY(unicode) < 0)
6407 return NULL;
6408 len = PyUnicode_GET_LENGTH(unicode);
6409
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006410 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006411 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006412 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006413 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414
6415 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006416 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006417 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006420 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006421 Py_DECREF(restuple);
6422 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006423 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006424 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006425 &resunicode, newpos)) {
6426 Py_DECREF(restuple);
6427 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006428 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006429 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6430 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6431 Py_DECREF(restuple);
6432 return NULL;
6433 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006434 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006435 *newpos = len + *newpos;
6436 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6438 Py_DECREF(restuple);
6439 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006440 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006441 Py_INCREF(resunicode);
6442 Py_DECREF(restuple);
6443 return resunicode;
6444}
6445
Alexander Belopolsky40018472011-02-26 01:02:56 +00006446static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006447unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006448 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006449 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006450{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006451 /* input state */
6452 Py_ssize_t pos=0, size;
6453 int kind;
6454 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006455 /* output object */
6456 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006457 /* pointer into the output */
6458 char *str;
6459 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006460 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006461 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6462 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006463 PyObject *errorHandler = NULL;
6464 PyObject *exc = NULL;
6465 /* the following variable is used for caching string comparisons
6466 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6467 int known_errorHandler = -1;
6468
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006469 if (PyUnicode_READY(unicode) < 0)
6470 return NULL;
6471 size = PyUnicode_GET_LENGTH(unicode);
6472 kind = PyUnicode_KIND(unicode);
6473 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006474 /* allocate enough for a simple encoding without
6475 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006476 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006477 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006478 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006479 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006480 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006481 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006482 ressize = size;
6483
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006484 while (pos < size) {
6485 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006486
Benjamin Peterson29060642009-01-31 22:14:21 +00006487 /* can we encode this? */
6488 if (c<limit) {
6489 /* no overflow check, because we know that the space is enough */
6490 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006491 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006492 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006493 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 Py_ssize_t requiredsize;
6495 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006496 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006498 Py_ssize_t collstart = pos;
6499 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006501 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 ++collend;
6503 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6504 if (known_errorHandler==-1) {
6505 if ((errors==NULL) || (!strcmp(errors, "strict")))
6506 known_errorHandler = 1;
6507 else if (!strcmp(errors, "replace"))
6508 known_errorHandler = 2;
6509 else if (!strcmp(errors, "ignore"))
6510 known_errorHandler = 3;
6511 else if (!strcmp(errors, "xmlcharrefreplace"))
6512 known_errorHandler = 4;
6513 else
6514 known_errorHandler = 0;
6515 }
6516 switch (known_errorHandler) {
6517 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006518 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006519 goto onError;
6520 case 2: /* replace */
6521 while (collstart++<collend)
6522 *str++ = '?'; /* fall through */
6523 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006524 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 break;
6526 case 4: /* xmlcharrefreplace */
6527 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006528 /* determine replacement size */
6529 for (i = collstart, repsize = 0; i < collend; ++i) {
6530 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6531 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006533 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006535 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006537 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006539 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006541 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006543 else {
6544 assert(ch <= 0x10FFFF);
Benjamin Peterson29060642009-01-31 22:14:21 +00006545 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006546 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006548 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 if (requiredsize > ressize) {
6550 if (requiredsize<2*ressize)
6551 requiredsize = 2*ressize;
6552 if (_PyBytes_Resize(&res, requiredsize))
6553 goto onError;
6554 str = PyBytes_AS_STRING(res) + respos;
6555 ressize = requiredsize;
6556 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006557 /* generate replacement */
6558 for (i = collstart; i < collend; ++i) {
6559 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006560 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006561 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006562 break;
6563 default:
6564 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006565 encoding, reason, unicode, &exc,
6566 collstart, collend, &newpos);
6567 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6568 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006569 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006570 if (PyBytes_Check(repunicode)) {
6571 /* Directly copy bytes result to output. */
6572 repsize = PyBytes_Size(repunicode);
6573 if (repsize > 1) {
6574 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006575 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006576 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6577 Py_DECREF(repunicode);
6578 goto onError;
6579 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006580 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006581 ressize += repsize-1;
6582 }
6583 memcpy(str, PyBytes_AsString(repunicode), repsize);
6584 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006585 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006586 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006587 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006588 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 /* need more space? (at least enough for what we
6590 have+the replacement+the rest of the string, so
6591 we won't have to check space for encodable characters) */
6592 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006593 repsize = PyUnicode_GET_LENGTH(repunicode);
6594 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006595 if (requiredsize > ressize) {
6596 if (requiredsize<2*ressize)
6597 requiredsize = 2*ressize;
6598 if (_PyBytes_Resize(&res, requiredsize)) {
6599 Py_DECREF(repunicode);
6600 goto onError;
6601 }
6602 str = PyBytes_AS_STRING(res) + respos;
6603 ressize = requiredsize;
6604 }
6605 /* check if there is anything unencodable in the replacement
6606 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006607 for (i = 0; repsize-->0; ++i, ++str) {
6608 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006609 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006610 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006611 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006612 Py_DECREF(repunicode);
6613 goto onError;
6614 }
6615 *str = (char)c;
6616 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006617 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006618 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006619 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006620 }
6621 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006622 /* Resize if we allocated to much */
6623 size = str - PyBytes_AS_STRING(res);
6624 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006625 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006626 if (_PyBytes_Resize(&res, size) < 0)
6627 goto onError;
6628 }
6629
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006630 Py_XDECREF(errorHandler);
6631 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006632 return res;
6633
6634 onError:
6635 Py_XDECREF(res);
6636 Py_XDECREF(errorHandler);
6637 Py_XDECREF(exc);
6638 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006639}
6640
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006641/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006642PyObject *
6643PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006644 Py_ssize_t size,
6645 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006647 PyObject *result;
6648 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6649 if (unicode == NULL)
6650 return NULL;
6651 result = unicode_encode_ucs1(unicode, errors, 256);
6652 Py_DECREF(unicode);
6653 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654}
6655
Alexander Belopolsky40018472011-02-26 01:02:56 +00006656PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006657_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658{
6659 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 PyErr_BadArgument();
6661 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006663 if (PyUnicode_READY(unicode) == -1)
6664 return NULL;
6665 /* Fast path: if it is a one-byte string, construct
6666 bytes object directly. */
6667 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6668 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6669 PyUnicode_GET_LENGTH(unicode));
6670 /* Non-Latin-1 characters present. Defer to above function to
6671 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006672 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006673}
6674
6675PyObject*
6676PyUnicode_AsLatin1String(PyObject *unicode)
6677{
6678 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679}
6680
6681/* --- 7-bit ASCII Codec -------------------------------------------------- */
6682
Alexander Belopolsky40018472011-02-26 01:02:56 +00006683PyObject *
6684PyUnicode_DecodeASCII(const char *s,
6685 Py_ssize_t size,
6686 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006688 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006689 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006690 int kind;
6691 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006692 Py_ssize_t startinpos;
6693 Py_ssize_t endinpos;
6694 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006695 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006696 int has_error;
6697 const unsigned char *p = (const unsigned char *)s;
6698 const unsigned char *end = p + size;
6699 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006700 PyObject *errorHandler = NULL;
6701 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006702
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006703 if (size == 0) {
6704 Py_INCREF(unicode_empty);
6705 return unicode_empty;
6706 }
6707
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006709 if (size == 1 && (unsigned char)s[0] < 128)
6710 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006711
Victor Stinner702c7342011-10-05 13:50:52 +02006712 has_error = 0;
6713 while (p < end && !has_error) {
6714 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6715 an explanation. */
6716 if (!((size_t) p & LONG_PTR_MASK)) {
6717 /* Help register allocation */
6718 register const unsigned char *_p = p;
6719 while (_p < aligned_end) {
6720 unsigned long value = *(unsigned long *) _p;
6721 if (value & ASCII_CHAR_MASK) {
6722 has_error = 1;
6723 break;
6724 }
6725 _p += SIZEOF_LONG;
6726 }
6727 if (_p == end)
6728 break;
6729 if (has_error)
6730 break;
6731 p = _p;
6732 }
6733 if (*p & 0x80) {
6734 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006735 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006736 }
6737 else {
6738 ++p;
6739 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006740 }
Victor Stinner702c7342011-10-05 13:50:52 +02006741 if (!has_error)
6742 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006743
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006744 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006748 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006749 kind = PyUnicode_KIND(v);
6750 data = PyUnicode_DATA(v);
6751 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006752 e = s + size;
6753 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006754 register unsigned char c = (unsigned char)*s;
6755 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006756 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 ++s;
6758 }
6759 else {
6760 startinpos = s-starts;
6761 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 if (unicode_decode_call_errorhandler(
6763 errors, &errorHandler,
6764 "ascii", "ordinal not in range(128)",
6765 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006766 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006768 kind = PyUnicode_KIND(v);
6769 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006772 if (PyUnicode_Resize(&v, outpos) < 0)
6773 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006774 Py_XDECREF(errorHandler);
6775 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006776 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006777 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006778
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006781 Py_XDECREF(errorHandler);
6782 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783 return NULL;
6784}
6785
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006786/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006787PyObject *
6788PyUnicode_EncodeASCII(const Py_UNICODE *p,
6789 Py_ssize_t size,
6790 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006792 PyObject *result;
6793 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6794 if (unicode == NULL)
6795 return NULL;
6796 result = unicode_encode_ucs1(unicode, errors, 128);
6797 Py_DECREF(unicode);
6798 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799}
6800
Alexander Belopolsky40018472011-02-26 01:02:56 +00006801PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006802_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803{
6804 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006805 PyErr_BadArgument();
6806 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006808 if (PyUnicode_READY(unicode) == -1)
6809 return NULL;
6810 /* Fast path: if it is an ASCII-only string, construct bytes object
6811 directly. Else defer to above function to raise the exception. */
6812 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6813 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6814 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006815 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006816}
6817
6818PyObject *
6819PyUnicode_AsASCIIString(PyObject *unicode)
6820{
6821 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822}
6823
Victor Stinner99b95382011-07-04 14:23:54 +02006824#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006825
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006826/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006827
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006828#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006829#define NEED_RETRY
6830#endif
6831
Victor Stinner3a50e702011-10-18 21:21:00 +02006832#ifndef WC_ERR_INVALID_CHARS
6833# define WC_ERR_INVALID_CHARS 0x0080
6834#endif
6835
6836static char*
6837code_page_name(UINT code_page, PyObject **obj)
6838{
6839 *obj = NULL;
6840 if (code_page == CP_ACP)
6841 return "mbcs";
6842 if (code_page == CP_UTF7)
6843 return "CP_UTF7";
6844 if (code_page == CP_UTF8)
6845 return "CP_UTF8";
6846
6847 *obj = PyBytes_FromFormat("cp%u", code_page);
6848 if (*obj == NULL)
6849 return NULL;
6850 return PyBytes_AS_STRING(*obj);
6851}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006852
Alexander Belopolsky40018472011-02-26 01:02:56 +00006853static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006854is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006855{
6856 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006857 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006858
Victor Stinner3a50e702011-10-18 21:21:00 +02006859 if (!IsDBCSLeadByteEx(code_page, *curr))
6860 return 0;
6861
6862 prev = CharPrevExA(code_page, s, curr, 0);
6863 if (prev == curr)
6864 return 1;
6865 /* FIXME: This code is limited to "true" double-byte encodings,
6866 as it assumes an incomplete character consists of a single
6867 byte. */
6868 if (curr - prev == 2)
6869 return 1;
6870 if (!IsDBCSLeadByteEx(code_page, *prev))
6871 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006872 return 0;
6873}
6874
Victor Stinner3a50e702011-10-18 21:21:00 +02006875static DWORD
6876decode_code_page_flags(UINT code_page)
6877{
6878 if (code_page == CP_UTF7) {
6879 /* The CP_UTF7 decoder only supports flags=0 */
6880 return 0;
6881 }
6882 else
6883 return MB_ERR_INVALID_CHARS;
6884}
6885
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006886/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006887 * Decode a byte string from a Windows code page into unicode object in strict
6888 * mode.
6889 *
6890 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6891 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006892 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006893static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006894decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006895 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006896 const char *in,
6897 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006898{
Victor Stinner3a50e702011-10-18 21:21:00 +02006899 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006900 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006901 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006902
6903 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006904 assert(insize > 0);
6905 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6906 if (outsize <= 0)
6907 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006908
6909 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006910 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006911 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006912 if (*v == NULL)
6913 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006914 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006915 }
6916 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006918 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006919 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006921 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006922 }
6923
6924 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006925 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6926 if (outsize <= 0)
6927 goto error;
6928 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006929
Victor Stinner3a50e702011-10-18 21:21:00 +02006930error:
6931 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6932 return -2;
6933 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006934 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006935}
6936
Victor Stinner3a50e702011-10-18 21:21:00 +02006937/*
6938 * Decode a byte string from a code page into unicode object with an error
6939 * handler.
6940 *
6941 * Returns consumed size if succeed, or raise a WindowsError or
6942 * UnicodeDecodeError exception and returns -1 on error.
6943 */
6944static int
6945decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006946 PyObject **v,
6947 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006948 const char *errors)
6949{
6950 const char *startin = in;
6951 const char *endin = in + size;
6952 const DWORD flags = decode_code_page_flags(code_page);
6953 /* Ideally, we should get reason from FormatMessage. This is the Windows
6954 2000 English version of the message. */
6955 const char *reason = "No mapping for the Unicode character exists "
6956 "in the target code page.";
6957 /* each step cannot decode more than 1 character, but a character can be
6958 represented as a surrogate pair */
6959 wchar_t buffer[2], *startout, *out;
6960 int insize, outsize;
6961 PyObject *errorHandler = NULL;
6962 PyObject *exc = NULL;
6963 PyObject *encoding_obj = NULL;
6964 char *encoding;
6965 DWORD err;
6966 int ret = -1;
6967
6968 assert(size > 0);
6969
6970 encoding = code_page_name(code_page, &encoding_obj);
6971 if (encoding == NULL)
6972 return -1;
6973
6974 if (errors == NULL || strcmp(errors, "strict") == 0) {
6975 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6976 UnicodeDecodeError. */
6977 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6978 if (exc != NULL) {
6979 PyCodec_StrictErrors(exc);
6980 Py_CLEAR(exc);
6981 }
6982 goto error;
6983 }
6984
6985 if (*v == NULL) {
6986 /* Create unicode object */
6987 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6988 PyErr_NoMemory();
6989 goto error;
6990 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006991 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006992 if (*v == NULL)
6993 goto error;
6994 startout = PyUnicode_AS_UNICODE(*v);
6995 }
6996 else {
6997 /* Extend unicode object */
6998 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6999 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7000 PyErr_NoMemory();
7001 goto error;
7002 }
Victor Stinner76a31a62011-11-04 00:05:13 +01007003 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007004 goto error;
7005 startout = PyUnicode_AS_UNICODE(*v) + n;
7006 }
7007
7008 /* Decode the byte string character per character */
7009 out = startout;
7010 while (in < endin)
7011 {
7012 /* Decode a character */
7013 insize = 1;
7014 do
7015 {
7016 outsize = MultiByteToWideChar(code_page, flags,
7017 in, insize,
7018 buffer, Py_ARRAY_LENGTH(buffer));
7019 if (outsize > 0)
7020 break;
7021 err = GetLastError();
7022 if (err != ERROR_NO_UNICODE_TRANSLATION
7023 && err != ERROR_INSUFFICIENT_BUFFER)
7024 {
7025 PyErr_SetFromWindowsErr(0);
7026 goto error;
7027 }
7028 insize++;
7029 }
7030 /* 4=maximum length of a UTF-8 sequence */
7031 while (insize <= 4 && (in + insize) <= endin);
7032
7033 if (outsize <= 0) {
7034 Py_ssize_t startinpos, endinpos, outpos;
7035
7036 startinpos = in - startin;
7037 endinpos = startinpos + 1;
7038 outpos = out - PyUnicode_AS_UNICODE(*v);
7039 if (unicode_decode_call_errorhandler(
7040 errors, &errorHandler,
7041 encoding, reason,
7042 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007043 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007044 {
7045 goto error;
7046 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007047 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007048 }
7049 else {
7050 in += insize;
7051 memcpy(out, buffer, outsize * sizeof(wchar_t));
7052 out += outsize;
7053 }
7054 }
7055
7056 /* write a NUL character at the end */
7057 *out = 0;
7058
7059 /* Extend unicode object */
7060 outsize = out - startout;
7061 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007062 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007063 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007064 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007065
7066error:
7067 Py_XDECREF(encoding_obj);
7068 Py_XDECREF(errorHandler);
7069 Py_XDECREF(exc);
7070 return ret;
7071}
7072
Victor Stinner3a50e702011-10-18 21:21:00 +02007073static PyObject *
7074decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007075 const char *s, Py_ssize_t size,
7076 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007077{
Victor Stinner76a31a62011-11-04 00:05:13 +01007078 PyObject *v = NULL;
7079 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007080
Victor Stinner3a50e702011-10-18 21:21:00 +02007081 if (code_page < 0) {
7082 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7083 return NULL;
7084 }
7085
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007086 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007087 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007088
Victor Stinner76a31a62011-11-04 00:05:13 +01007089 do
7090 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007091#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007092 if (size > INT_MAX) {
7093 chunk_size = INT_MAX;
7094 final = 0;
7095 done = 0;
7096 }
7097 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007098#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007099 {
7100 chunk_size = (int)size;
7101 final = (consumed == NULL);
7102 done = 1;
7103 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007104
Victor Stinner76a31a62011-11-04 00:05:13 +01007105 /* Skip trailing lead-byte unless 'final' is set */
7106 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7107 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007108
Victor Stinner76a31a62011-11-04 00:05:13 +01007109 if (chunk_size == 0 && done) {
7110 if (v != NULL)
7111 break;
7112 Py_INCREF(unicode_empty);
7113 return unicode_empty;
7114 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007115
Victor Stinner76a31a62011-11-04 00:05:13 +01007116
7117 converted = decode_code_page_strict(code_page, &v,
7118 s, chunk_size);
7119 if (converted == -2)
7120 converted = decode_code_page_errors(code_page, &v,
7121 s, chunk_size,
7122 errors);
7123 assert(converted != 0);
7124
7125 if (converted < 0) {
7126 Py_XDECREF(v);
7127 return NULL;
7128 }
7129
7130 if (consumed)
7131 *consumed += converted;
7132
7133 s += converted;
7134 size -= converted;
7135 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007136
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007137 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007138}
7139
Alexander Belopolsky40018472011-02-26 01:02:56 +00007140PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007141PyUnicode_DecodeCodePageStateful(int code_page,
7142 const char *s,
7143 Py_ssize_t size,
7144 const char *errors,
7145 Py_ssize_t *consumed)
7146{
7147 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7148}
7149
7150PyObject *
7151PyUnicode_DecodeMBCSStateful(const char *s,
7152 Py_ssize_t size,
7153 const char *errors,
7154 Py_ssize_t *consumed)
7155{
7156 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7157}
7158
7159PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007160PyUnicode_DecodeMBCS(const char *s,
7161 Py_ssize_t size,
7162 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007163{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007164 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7165}
7166
Victor Stinner3a50e702011-10-18 21:21:00 +02007167static DWORD
7168encode_code_page_flags(UINT code_page, const char *errors)
7169{
7170 if (code_page == CP_UTF8) {
7171 if (winver.dwMajorVersion >= 6)
7172 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7173 and later */
7174 return WC_ERR_INVALID_CHARS;
7175 else
7176 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7177 return 0;
7178 }
7179 else if (code_page == CP_UTF7) {
7180 /* CP_UTF7 only supports flags=0 */
7181 return 0;
7182 }
7183 else {
7184 if (errors != NULL && strcmp(errors, "replace") == 0)
7185 return 0;
7186 else
7187 return WC_NO_BEST_FIT_CHARS;
7188 }
7189}
7190
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007191/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 * Encode a Unicode string to a Windows code page into a byte string in strict
7193 * mode.
7194 *
7195 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7196 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007197 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007198static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007199encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007200 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007201 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007202{
Victor Stinner554f3f02010-06-16 23:33:54 +00007203 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007204 BOOL *pusedDefaultChar = &usedDefaultChar;
7205 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007206 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007207 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007208 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007209 const DWORD flags = encode_code_page_flags(code_page, NULL);
7210 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007211 /* Create a substring so that we can get the UTF-16 representation
7212 of just the slice under consideration. */
7213 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007214
Martin v. Löwis3d325192011-11-04 18:23:06 +01007215 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007216
Victor Stinner3a50e702011-10-18 21:21:00 +02007217 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007218 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007219 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007220 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007221
Victor Stinner2fc507f2011-11-04 20:06:39 +01007222 substring = PyUnicode_Substring(unicode, offset, offset+len);
7223 if (substring == NULL)
7224 return -1;
7225 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7226 if (p == NULL) {
7227 Py_DECREF(substring);
7228 return -1;
7229 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007230
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007231 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007232 outsize = WideCharToMultiByte(code_page, flags,
7233 p, size,
7234 NULL, 0,
7235 NULL, pusedDefaultChar);
7236 if (outsize <= 0)
7237 goto error;
7238 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007239 if (pusedDefaultChar && *pusedDefaultChar) {
7240 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007241 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007242 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007243
Victor Stinner3a50e702011-10-18 21:21:00 +02007244 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007245 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007246 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007247 if (*outbytes == NULL) {
7248 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007249 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007250 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007251 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007252 }
7253 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007254 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007255 const Py_ssize_t n = PyBytes_Size(*outbytes);
7256 if (outsize > PY_SSIZE_T_MAX - n) {
7257 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007258 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007259 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007260 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007261 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7262 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007264 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007265 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007266 }
7267
7268 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007269 outsize = WideCharToMultiByte(code_page, flags,
7270 p, size,
7271 out, outsize,
7272 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007273 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007274 if (outsize <= 0)
7275 goto error;
7276 if (pusedDefaultChar && *pusedDefaultChar)
7277 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007278 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007279
Victor Stinner3a50e702011-10-18 21:21:00 +02007280error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007281 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007282 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7283 return -2;
7284 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007285 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007286}
7287
Victor Stinner3a50e702011-10-18 21:21:00 +02007288/*
7289 * Encode a Unicode string to a Windows code page into a byte string using a
7290 * error handler.
7291 *
7292 * Returns consumed characters if succeed, or raise a WindowsError and returns
7293 * -1 on other error.
7294 */
7295static int
7296encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007297 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007298 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007299{
Victor Stinner3a50e702011-10-18 21:21:00 +02007300 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007301 Py_ssize_t pos = unicode_offset;
7302 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007303 /* Ideally, we should get reason from FormatMessage. This is the Windows
7304 2000 English version of the message. */
7305 const char *reason = "invalid character";
7306 /* 4=maximum length of a UTF-8 sequence */
7307 char buffer[4];
7308 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7309 Py_ssize_t outsize;
7310 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007311 PyObject *errorHandler = NULL;
7312 PyObject *exc = NULL;
7313 PyObject *encoding_obj = NULL;
7314 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007315 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007316 PyObject *rep;
7317 int ret = -1;
7318
7319 assert(insize > 0);
7320
7321 encoding = code_page_name(code_page, &encoding_obj);
7322 if (encoding == NULL)
7323 return -1;
7324
7325 if (errors == NULL || strcmp(errors, "strict") == 0) {
7326 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7327 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007328 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007329 if (exc != NULL) {
7330 PyCodec_StrictErrors(exc);
7331 Py_DECREF(exc);
7332 }
7333 Py_XDECREF(encoding_obj);
7334 return -1;
7335 }
7336
7337 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7338 pusedDefaultChar = &usedDefaultChar;
7339 else
7340 pusedDefaultChar = NULL;
7341
7342 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7343 PyErr_NoMemory();
7344 goto error;
7345 }
7346 outsize = insize * Py_ARRAY_LENGTH(buffer);
7347
7348 if (*outbytes == NULL) {
7349 /* Create string object */
7350 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7351 if (*outbytes == NULL)
7352 goto error;
7353 out = PyBytes_AS_STRING(*outbytes);
7354 }
7355 else {
7356 /* Extend string object */
7357 Py_ssize_t n = PyBytes_Size(*outbytes);
7358 if (n > PY_SSIZE_T_MAX - outsize) {
7359 PyErr_NoMemory();
7360 goto error;
7361 }
7362 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7363 goto error;
7364 out = PyBytes_AS_STRING(*outbytes) + n;
7365 }
7366
7367 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007368 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007369 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007370 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7371 wchar_t chars[2];
7372 int charsize;
7373 if (ch < 0x10000) {
7374 chars[0] = (wchar_t)ch;
7375 charsize = 1;
7376 }
7377 else {
7378 ch -= 0x10000;
7379 chars[0] = 0xd800 + (ch >> 10);
7380 chars[1] = 0xdc00 + (ch & 0x3ff);
7381 charsize = 2;
7382 }
7383
Victor Stinner3a50e702011-10-18 21:21:00 +02007384 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007385 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007386 buffer, Py_ARRAY_LENGTH(buffer),
7387 NULL, pusedDefaultChar);
7388 if (outsize > 0) {
7389 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7390 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007391 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007392 memcpy(out, buffer, outsize);
7393 out += outsize;
7394 continue;
7395 }
7396 }
7397 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7398 PyErr_SetFromWindowsErr(0);
7399 goto error;
7400 }
7401
Victor Stinner3a50e702011-10-18 21:21:00 +02007402 rep = unicode_encode_call_errorhandler(
7403 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007404 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007405 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007406 if (rep == NULL)
7407 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007408 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007409
7410 if (PyBytes_Check(rep)) {
7411 outsize = PyBytes_GET_SIZE(rep);
7412 if (outsize != 1) {
7413 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7414 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7415 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7416 Py_DECREF(rep);
7417 goto error;
7418 }
7419 out = PyBytes_AS_STRING(*outbytes) + offset;
7420 }
7421 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7422 out += outsize;
7423 }
7424 else {
7425 Py_ssize_t i;
7426 enum PyUnicode_Kind kind;
7427 void *data;
7428
7429 if (PyUnicode_READY(rep) < 0) {
7430 Py_DECREF(rep);
7431 goto error;
7432 }
7433
7434 outsize = PyUnicode_GET_LENGTH(rep);
7435 if (outsize != 1) {
7436 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7437 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7438 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7439 Py_DECREF(rep);
7440 goto error;
7441 }
7442 out = PyBytes_AS_STRING(*outbytes) + offset;
7443 }
7444 kind = PyUnicode_KIND(rep);
7445 data = PyUnicode_DATA(rep);
7446 for (i=0; i < outsize; i++) {
7447 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7448 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007449 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007450 encoding, unicode,
7451 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 "unable to encode error handler result to ASCII");
7453 Py_DECREF(rep);
7454 goto error;
7455 }
7456 *out = (unsigned char)ch;
7457 out++;
7458 }
7459 }
7460 Py_DECREF(rep);
7461 }
7462 /* write a NUL byte */
7463 *out = 0;
7464 outsize = out - PyBytes_AS_STRING(*outbytes);
7465 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7466 if (_PyBytes_Resize(outbytes, outsize) < 0)
7467 goto error;
7468 ret = 0;
7469
7470error:
7471 Py_XDECREF(encoding_obj);
7472 Py_XDECREF(errorHandler);
7473 Py_XDECREF(exc);
7474 return ret;
7475}
7476
Victor Stinner3a50e702011-10-18 21:21:00 +02007477static PyObject *
7478encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007479 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 const char *errors)
7481{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007482 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007484 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007485 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007486
Victor Stinner2fc507f2011-11-04 20:06:39 +01007487 if (PyUnicode_READY(unicode) < 0)
7488 return NULL;
7489 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007490
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 if (code_page < 0) {
7492 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7493 return NULL;
7494 }
7495
Martin v. Löwis3d325192011-11-04 18:23:06 +01007496 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007497 return PyBytes_FromStringAndSize(NULL, 0);
7498
Victor Stinner7581cef2011-11-03 22:32:33 +01007499 offset = 0;
7500 do
7501 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007502#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007503 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007504 chunks. */
7505 if (len > INT_MAX/2) {
7506 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007507 done = 0;
7508 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007509 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007510#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007511 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007512 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007513 done = 1;
7514 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007515
Victor Stinner76a31a62011-11-04 00:05:13 +01007516 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007517 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007518 errors);
7519 if (ret == -2)
7520 ret = encode_code_page_errors(code_page, &outbytes,
7521 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007522 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007523 if (ret < 0) {
7524 Py_XDECREF(outbytes);
7525 return NULL;
7526 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007527
Victor Stinner7581cef2011-11-03 22:32:33 +01007528 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007529 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007530 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007531
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 return outbytes;
7533}
7534
7535PyObject *
7536PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7537 Py_ssize_t size,
7538 const char *errors)
7539{
Victor Stinner7581cef2011-11-03 22:32:33 +01007540 PyObject *unicode, *res;
7541 unicode = PyUnicode_FromUnicode(p, size);
7542 if (unicode == NULL)
7543 return NULL;
7544 res = encode_code_page(CP_ACP, unicode, errors);
7545 Py_DECREF(unicode);
7546 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007547}
7548
7549PyObject *
7550PyUnicode_EncodeCodePage(int code_page,
7551 PyObject *unicode,
7552 const char *errors)
7553{
Victor Stinner7581cef2011-11-03 22:32:33 +01007554 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007555}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007556
Alexander Belopolsky40018472011-02-26 01:02:56 +00007557PyObject *
7558PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007559{
7560 if (!PyUnicode_Check(unicode)) {
7561 PyErr_BadArgument();
7562 return NULL;
7563 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007564 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007565}
7566
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007567#undef NEED_RETRY
7568
Victor Stinner99b95382011-07-04 14:23:54 +02007569#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007570
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571/* --- Character Mapping Codec -------------------------------------------- */
7572
Alexander Belopolsky40018472011-02-26 01:02:56 +00007573PyObject *
7574PyUnicode_DecodeCharmap(const char *s,
7575 Py_ssize_t size,
7576 PyObject *mapping,
7577 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007579 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007580 Py_ssize_t startinpos;
7581 Py_ssize_t endinpos;
7582 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007583 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007584 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007585 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007586 PyObject *errorHandler = NULL;
7587 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007588
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589 /* Default to Latin-1 */
7590 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007591 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007593 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007597 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007598 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007599 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007600 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007601 Py_ssize_t maplen;
7602 enum PyUnicode_Kind kind;
7603 void *data;
7604 Py_UCS4 x;
7605
7606 if (PyUnicode_READY(mapping) < 0)
7607 return NULL;
7608
7609 maplen = PyUnicode_GET_LENGTH(mapping);
7610 data = PyUnicode_DATA(mapping);
7611 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007612 while (s < e) {
7613 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614
Benjamin Peterson29060642009-01-31 22:14:21 +00007615 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007616 x = PyUnicode_READ(kind, data, ch);
7617 else
7618 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007620 if (x == 0xfffe)
7621 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007622 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007623 startinpos = s-starts;
7624 endinpos = startinpos+1;
7625 if (unicode_decode_call_errorhandler(
7626 errors, &errorHandler,
7627 "charmap", "character maps to <undefined>",
7628 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007629 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 goto onError;
7631 }
7632 continue;
7633 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007634
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007635 if (unicode_putchar(&v, &outpos, x) < 0)
7636 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007638 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007639 }
7640 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 while (s < e) {
7642 unsigned char ch = *s;
7643 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007644
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7646 w = PyLong_FromLong((long)ch);
7647 if (w == NULL)
7648 goto onError;
7649 x = PyObject_GetItem(mapping, w);
7650 Py_DECREF(w);
7651 if (x == NULL) {
7652 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7653 /* No mapping found means: mapping is undefined. */
7654 PyErr_Clear();
7655 x = Py_None;
7656 Py_INCREF(x);
7657 } else
7658 goto onError;
7659 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007660
Benjamin Peterson29060642009-01-31 22:14:21 +00007661 /* Apply mapping */
7662 if (PyLong_Check(x)) {
7663 long value = PyLong_AS_LONG(x);
7664 if (value < 0 || value > 65535) {
7665 PyErr_SetString(PyExc_TypeError,
7666 "character mapping must be in range(65536)");
7667 Py_DECREF(x);
7668 goto onError;
7669 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007670 if (unicode_putchar(&v, &outpos, value) < 0)
7671 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007672 }
7673 else if (x == Py_None) {
7674 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 startinpos = s-starts;
7676 endinpos = startinpos+1;
7677 if (unicode_decode_call_errorhandler(
7678 errors, &errorHandler,
7679 "charmap", "character maps to <undefined>",
7680 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007681 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007682 Py_DECREF(x);
7683 goto onError;
7684 }
7685 Py_DECREF(x);
7686 continue;
7687 }
7688 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007689 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007690
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007691 if (PyUnicode_READY(x) < 0)
7692 goto onError;
7693 targetsize = PyUnicode_GET_LENGTH(x);
7694
7695 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007696 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007697 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007698 PyUnicode_READ_CHAR(x, 0)) < 0)
7699 goto onError;
7700 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007701 else if (targetsize > 1) {
7702 /* 1-n mapping */
7703 if (targetsize > extrachars) {
7704 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007705 Py_ssize_t needed = (targetsize - extrachars) + \
7706 (targetsize << 2);
7707 extrachars += needed;
7708 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007709 if (PyUnicode_Resize(&v,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007710 PyUnicode_GET_LENGTH(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007711 Py_DECREF(x);
7712 goto onError;
7713 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007715 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7716 goto onError;
7717 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7718 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007719 extrachars -= targetsize;
7720 }
7721 /* 1-0 mapping: skip the character */
7722 }
7723 else {
7724 /* wrong return value */
7725 PyErr_SetString(PyExc_TypeError,
7726 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007727 Py_DECREF(x);
7728 goto onError;
7729 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007730 Py_DECREF(x);
7731 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007732 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007734 if (PyUnicode_Resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007735 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007736 Py_XDECREF(errorHandler);
7737 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007738 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007739
Benjamin Peterson29060642009-01-31 22:14:21 +00007740 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007741 Py_XDECREF(errorHandler);
7742 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743 Py_XDECREF(v);
7744 return NULL;
7745}
7746
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007747/* Charmap encoding: the lookup table */
7748
Alexander Belopolsky40018472011-02-26 01:02:56 +00007749struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 PyObject_HEAD
7751 unsigned char level1[32];
7752 int count2, count3;
7753 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007754};
7755
7756static PyObject*
7757encoding_map_size(PyObject *obj, PyObject* args)
7758{
7759 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007760 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007762}
7763
7764static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007765 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007766 PyDoc_STR("Return the size (in bytes) of this object") },
7767 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007768};
7769
7770static void
7771encoding_map_dealloc(PyObject* o)
7772{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007773 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007774}
7775
7776static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007777 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007778 "EncodingMap", /*tp_name*/
7779 sizeof(struct encoding_map), /*tp_basicsize*/
7780 0, /*tp_itemsize*/
7781 /* methods */
7782 encoding_map_dealloc, /*tp_dealloc*/
7783 0, /*tp_print*/
7784 0, /*tp_getattr*/
7785 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007786 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 0, /*tp_repr*/
7788 0, /*tp_as_number*/
7789 0, /*tp_as_sequence*/
7790 0, /*tp_as_mapping*/
7791 0, /*tp_hash*/
7792 0, /*tp_call*/
7793 0, /*tp_str*/
7794 0, /*tp_getattro*/
7795 0, /*tp_setattro*/
7796 0, /*tp_as_buffer*/
7797 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7798 0, /*tp_doc*/
7799 0, /*tp_traverse*/
7800 0, /*tp_clear*/
7801 0, /*tp_richcompare*/
7802 0, /*tp_weaklistoffset*/
7803 0, /*tp_iter*/
7804 0, /*tp_iternext*/
7805 encoding_map_methods, /*tp_methods*/
7806 0, /*tp_members*/
7807 0, /*tp_getset*/
7808 0, /*tp_base*/
7809 0, /*tp_dict*/
7810 0, /*tp_descr_get*/
7811 0, /*tp_descr_set*/
7812 0, /*tp_dictoffset*/
7813 0, /*tp_init*/
7814 0, /*tp_alloc*/
7815 0, /*tp_new*/
7816 0, /*tp_free*/
7817 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007818};
7819
7820PyObject*
7821PyUnicode_BuildEncodingMap(PyObject* string)
7822{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007823 PyObject *result;
7824 struct encoding_map *mresult;
7825 int i;
7826 int need_dict = 0;
7827 unsigned char level1[32];
7828 unsigned char level2[512];
7829 unsigned char *mlevel1, *mlevel2, *mlevel3;
7830 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007831 int kind;
7832 void *data;
7833 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007834
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007835 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007836 PyErr_BadArgument();
7837 return NULL;
7838 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007839 kind = PyUnicode_KIND(string);
7840 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007841 memset(level1, 0xFF, sizeof level1);
7842 memset(level2, 0xFF, sizeof level2);
7843
7844 /* If there isn't a one-to-one mapping of NULL to \0,
7845 or if there are non-BMP characters, we need to use
7846 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007847 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007848 need_dict = 1;
7849 for (i = 1; i < 256; i++) {
7850 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007851 ch = PyUnicode_READ(kind, data, i);
7852 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007853 need_dict = 1;
7854 break;
7855 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007856 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007857 /* unmapped character */
7858 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007859 l1 = ch >> 11;
7860 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007861 if (level1[l1] == 0xFF)
7862 level1[l1] = count2++;
7863 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007864 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007865 }
7866
7867 if (count2 >= 0xFF || count3 >= 0xFF)
7868 need_dict = 1;
7869
7870 if (need_dict) {
7871 PyObject *result = PyDict_New();
7872 PyObject *key, *value;
7873 if (!result)
7874 return NULL;
7875 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007876 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007877 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007878 if (!key || !value)
7879 goto failed1;
7880 if (PyDict_SetItem(result, key, value) == -1)
7881 goto failed1;
7882 Py_DECREF(key);
7883 Py_DECREF(value);
7884 }
7885 return result;
7886 failed1:
7887 Py_XDECREF(key);
7888 Py_XDECREF(value);
7889 Py_DECREF(result);
7890 return NULL;
7891 }
7892
7893 /* Create a three-level trie */
7894 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7895 16*count2 + 128*count3 - 1);
7896 if (!result)
7897 return PyErr_NoMemory();
7898 PyObject_Init(result, &EncodingMapType);
7899 mresult = (struct encoding_map*)result;
7900 mresult->count2 = count2;
7901 mresult->count3 = count3;
7902 mlevel1 = mresult->level1;
7903 mlevel2 = mresult->level23;
7904 mlevel3 = mresult->level23 + 16*count2;
7905 memcpy(mlevel1, level1, 32);
7906 memset(mlevel2, 0xFF, 16*count2);
7907 memset(mlevel3, 0, 128*count3);
7908 count3 = 0;
7909 for (i = 1; i < 256; i++) {
7910 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007911 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007912 /* unmapped character */
7913 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007914 o1 = PyUnicode_READ(kind, data, i)>>11;
7915 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007916 i2 = 16*mlevel1[o1] + o2;
7917 if (mlevel2[i2] == 0xFF)
7918 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007919 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007920 i3 = 128*mlevel2[i2] + o3;
7921 mlevel3[i3] = i;
7922 }
7923 return result;
7924}
7925
7926static int
Victor Stinner22168992011-11-20 17:09:18 +01007927encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007928{
7929 struct encoding_map *map = (struct encoding_map*)mapping;
7930 int l1 = c>>11;
7931 int l2 = (c>>7) & 0xF;
7932 int l3 = c & 0x7F;
7933 int i;
7934
Victor Stinner22168992011-11-20 17:09:18 +01007935 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007936 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007937 if (c == 0)
7938 return 0;
7939 /* level 1*/
7940 i = map->level1[l1];
7941 if (i == 0xFF) {
7942 return -1;
7943 }
7944 /* level 2*/
7945 i = map->level23[16*i+l2];
7946 if (i == 0xFF) {
7947 return -1;
7948 }
7949 /* level 3 */
7950 i = map->level23[16*map->count2 + 128*i + l3];
7951 if (i == 0) {
7952 return -1;
7953 }
7954 return i;
7955}
7956
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007957/* Lookup the character ch in the mapping. If the character
7958 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007959 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007960static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007961charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962{
Christian Heimes217cfd12007-12-02 14:31:20 +00007963 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007964 PyObject *x;
7965
7966 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007967 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007968 x = PyObject_GetItem(mapping, w);
7969 Py_DECREF(w);
7970 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007971 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7972 /* No mapping found means: mapping is undefined. */
7973 PyErr_Clear();
7974 x = Py_None;
7975 Py_INCREF(x);
7976 return x;
7977 } else
7978 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007980 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007981 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007982 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 long value = PyLong_AS_LONG(x);
7984 if (value < 0 || value > 255) {
7985 PyErr_SetString(PyExc_TypeError,
7986 "character mapping must be in range(256)");
7987 Py_DECREF(x);
7988 return NULL;
7989 }
7990 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007992 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 /* wrong return value */
7996 PyErr_Format(PyExc_TypeError,
7997 "character mapping must return integer, bytes or None, not %.400s",
7998 x->ob_type->tp_name);
7999 Py_DECREF(x);
8000 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001 }
8002}
8003
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008004static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008005charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008006{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008007 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8008 /* exponentially overallocate to minimize reallocations */
8009 if (requiredsize < 2*outsize)
8010 requiredsize = 2*outsize;
8011 if (_PyBytes_Resize(outobj, requiredsize))
8012 return -1;
8013 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008014}
8015
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008019/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008020 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008021 space is available. Return a new reference to the object that
8022 was put in the output buffer, or Py_None, if the mapping was undefined
8023 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008024 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008025static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008026charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008027 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008028{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008029 PyObject *rep;
8030 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008031 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008032
Christian Heimes90aa7642007-12-19 02:45:37 +00008033 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008034 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008036 if (res == -1)
8037 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008038 if (outsize<requiredsize)
8039 if (charmapencode_resize(outobj, outpos, requiredsize))
8040 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008041 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 outstart[(*outpos)++] = (char)res;
8043 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008044 }
8045
8046 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008047 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008049 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008050 Py_DECREF(rep);
8051 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008052 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008053 if (PyLong_Check(rep)) {
8054 Py_ssize_t requiredsize = *outpos+1;
8055 if (outsize<requiredsize)
8056 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8057 Py_DECREF(rep);
8058 return enc_EXCEPTION;
8059 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008060 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008061 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008062 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 else {
8064 const char *repchars = PyBytes_AS_STRING(rep);
8065 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8066 Py_ssize_t requiredsize = *outpos+repsize;
8067 if (outsize<requiredsize)
8068 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8069 Py_DECREF(rep);
8070 return enc_EXCEPTION;
8071 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008072 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008073 memcpy(outstart + *outpos, repchars, repsize);
8074 *outpos += repsize;
8075 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008076 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008077 Py_DECREF(rep);
8078 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008079}
8080
8081/* handle an error in PyUnicode_EncodeCharmap
8082 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008083static int
8084charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008085 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008086 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008087 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008088 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008089{
8090 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008091 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008092 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008093 enum PyUnicode_Kind kind;
8094 void *data;
8095 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008096 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008097 Py_ssize_t collstartpos = *inpos;
8098 Py_ssize_t collendpos = *inpos+1;
8099 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008100 char *encoding = "charmap";
8101 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008102 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008103 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008104 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008105
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008106 if (PyUnicode_READY(unicode) < 0)
8107 return -1;
8108 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008109 /* find all unencodable characters */
8110 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008111 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008112 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008113 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008114 val = encoding_map_lookup(ch, mapping);
8115 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008116 break;
8117 ++collendpos;
8118 continue;
8119 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008120
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008121 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8122 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008123 if (rep==NULL)
8124 return -1;
8125 else if (rep!=Py_None) {
8126 Py_DECREF(rep);
8127 break;
8128 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008129 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008130 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008131 }
8132 /* cache callback name lookup
8133 * (if not done yet, i.e. it's the first error) */
8134 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 if ((errors==NULL) || (!strcmp(errors, "strict")))
8136 *known_errorHandler = 1;
8137 else if (!strcmp(errors, "replace"))
8138 *known_errorHandler = 2;
8139 else if (!strcmp(errors, "ignore"))
8140 *known_errorHandler = 3;
8141 else if (!strcmp(errors, "xmlcharrefreplace"))
8142 *known_errorHandler = 4;
8143 else
8144 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008145 }
8146 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008147 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008148 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008149 return -1;
8150 case 2: /* replace */
8151 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008152 x = charmapencode_output('?', mapping, res, respos);
8153 if (x==enc_EXCEPTION) {
8154 return -1;
8155 }
8156 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008157 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008158 return -1;
8159 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008160 }
8161 /* fall through */
8162 case 3: /* ignore */
8163 *inpos = collendpos;
8164 break;
8165 case 4: /* xmlcharrefreplace */
8166 /* generate replacement (temporarily (mis)uses p) */
8167 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 char buffer[2+29+1+1];
8169 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008170 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008171 for (cp = buffer; *cp; ++cp) {
8172 x = charmapencode_output(*cp, mapping, res, respos);
8173 if (x==enc_EXCEPTION)
8174 return -1;
8175 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008176 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 return -1;
8178 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008179 }
8180 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008181 *inpos = collendpos;
8182 break;
8183 default:
8184 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008185 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008187 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008188 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008189 if (PyBytes_Check(repunicode)) {
8190 /* Directly copy bytes result to output. */
8191 Py_ssize_t outsize = PyBytes_Size(*res);
8192 Py_ssize_t requiredsize;
8193 repsize = PyBytes_Size(repunicode);
8194 requiredsize = *respos + repsize;
8195 if (requiredsize > outsize)
8196 /* Make room for all additional bytes. */
8197 if (charmapencode_resize(res, respos, requiredsize)) {
8198 Py_DECREF(repunicode);
8199 return -1;
8200 }
8201 memcpy(PyBytes_AsString(*res) + *respos,
8202 PyBytes_AsString(repunicode), repsize);
8203 *respos += repsize;
8204 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008205 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008206 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008207 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008208 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008209 if (PyUnicode_READY(repunicode) < 0) {
8210 Py_DECREF(repunicode);
8211 return -1;
8212 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008213 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008214 data = PyUnicode_DATA(repunicode);
8215 kind = PyUnicode_KIND(repunicode);
8216 for (index = 0; index < repsize; index++) {
8217 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8218 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008220 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 return -1;
8222 }
8223 else if (x==enc_FAILED) {
8224 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008225 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008226 return -1;
8227 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008228 }
8229 *inpos = newpos;
8230 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008231 }
8232 return 0;
8233}
8234
Alexander Belopolsky40018472011-02-26 01:02:56 +00008235PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008236_PyUnicode_EncodeCharmap(PyObject *unicode,
8237 PyObject *mapping,
8238 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240 /* output object */
8241 PyObject *res = NULL;
8242 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008243 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008244 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008245 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008246 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008247 PyObject *errorHandler = NULL;
8248 PyObject *exc = NULL;
8249 /* the following variable is used for caching string comparisons
8250 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8251 * 3=ignore, 4=xmlcharrefreplace */
8252 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008254 if (PyUnicode_READY(unicode) < 0)
8255 return NULL;
8256 size = PyUnicode_GET_LENGTH(unicode);
8257
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258 /* Default to Latin-1 */
8259 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008260 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262 /* allocate enough for a simple encoding without
8263 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008264 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008265 if (res == NULL)
8266 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008267 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008270 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008271 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008273 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 if (x==enc_EXCEPTION) /* error */
8275 goto onError;
8276 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008277 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 &exc,
8279 &known_errorHandler, &errorHandler, errors,
8280 &res, &respos)) {
8281 goto onError;
8282 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008283 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 else
8285 /* done with this character => adjust input position */
8286 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008289 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008290 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008291 if (_PyBytes_Resize(&res, respos) < 0)
8292 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008293
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008294 Py_XDECREF(exc);
8295 Py_XDECREF(errorHandler);
8296 return res;
8297
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008299 Py_XDECREF(res);
8300 Py_XDECREF(exc);
8301 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 return NULL;
8303}
8304
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008305/* Deprecated */
8306PyObject *
8307PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8308 Py_ssize_t size,
8309 PyObject *mapping,
8310 const char *errors)
8311{
8312 PyObject *result;
8313 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8314 if (unicode == NULL)
8315 return NULL;
8316 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8317 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008318 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008319}
8320
Alexander Belopolsky40018472011-02-26 01:02:56 +00008321PyObject *
8322PyUnicode_AsCharmapString(PyObject *unicode,
8323 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324{
8325 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 PyErr_BadArgument();
8327 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008329 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330}
8331
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008332/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008333static void
8334make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008335 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008336 Py_ssize_t startpos, Py_ssize_t endpos,
8337 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008339 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008340 *exceptionObject = _PyUnicodeTranslateError_Create(
8341 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342 }
8343 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8345 goto onError;
8346 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8347 goto onError;
8348 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8349 goto onError;
8350 return;
8351 onError:
8352 Py_DECREF(*exceptionObject);
8353 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354 }
8355}
8356
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008357/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008358static void
8359raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008360 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008361 Py_ssize_t startpos, Py_ssize_t endpos,
8362 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008363{
8364 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008365 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008366 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008368}
8369
8370/* error handling callback helper:
8371 build arguments, call the callback and check the arguments,
8372 put the result into newpos and return the replacement string, which
8373 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008374static PyObject *
8375unicode_translate_call_errorhandler(const char *errors,
8376 PyObject **errorHandler,
8377 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008378 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008379 Py_ssize_t startpos, Py_ssize_t endpos,
8380 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008382 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008383
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008384 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385 PyObject *restuple;
8386 PyObject *resunicode;
8387
8388 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 }
8393
8394 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008395 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008396 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008398
8399 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008401 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008404 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 Py_DECREF(restuple);
8406 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008407 }
8408 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 &resunicode, &i_newpos)) {
8410 Py_DECREF(restuple);
8411 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008412 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008413 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008414 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008415 else
8416 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008417 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8419 Py_DECREF(restuple);
8420 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008421 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422 Py_INCREF(resunicode);
8423 Py_DECREF(restuple);
8424 return resunicode;
8425}
8426
8427/* Lookup the character ch in the mapping and put the result in result,
8428 which must be decrefed by the caller.
8429 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008430static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008431charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008432{
Christian Heimes217cfd12007-12-02 14:31:20 +00008433 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434 PyObject *x;
8435
8436 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008438 x = PyObject_GetItem(mapping, w);
8439 Py_DECREF(w);
8440 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8442 /* No mapping found means: use 1:1 mapping. */
8443 PyErr_Clear();
8444 *result = NULL;
8445 return 0;
8446 } else
8447 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008448 }
8449 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 *result = x;
8451 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008452 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008453 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 long value = PyLong_AS_LONG(x);
8455 long max = PyUnicode_GetMax();
8456 if (value < 0 || value > max) {
8457 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008458 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 Py_DECREF(x);
8460 return -1;
8461 }
8462 *result = x;
8463 return 0;
8464 }
8465 else if (PyUnicode_Check(x)) {
8466 *result = x;
8467 return 0;
8468 }
8469 else {
8470 /* wrong return value */
8471 PyErr_SetString(PyExc_TypeError,
8472 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008473 Py_DECREF(x);
8474 return -1;
8475 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008476}
8477/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 if not reallocate and adjust various state variables.
8479 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008480static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008481charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008483{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008484 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008485 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 /* exponentially overallocate to minimize reallocations */
8487 if (requiredsize < 2 * oldsize)
8488 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008489 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8490 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008492 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008493 }
8494 return 0;
8495}
8496/* lookup the character, put the result in the output string and adjust
8497 various state variables. Return a new reference to the object that
8498 was put in the output buffer in *result, or Py_None, if the mapping was
8499 undefined (in which case no character was written).
8500 The called must decref result.
8501 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008502static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008503charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8504 PyObject *mapping, Py_UCS4 **output,
8505 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008506 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008507{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008508 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8509 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008511 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008513 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008514 }
8515 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008517 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008519 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520 }
8521 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008522 Py_ssize_t repsize;
8523 if (PyUnicode_READY(*res) == -1)
8524 return -1;
8525 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 if (repsize==1) {
8527 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008528 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 }
8530 else if (repsize!=0) {
8531 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008532 Py_ssize_t requiredsize = *opos +
8533 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008534 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008535 Py_ssize_t i;
8536 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008537 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538 for(i = 0; i < repsize; i++)
8539 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008541 }
8542 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008543 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008544 return 0;
8545}
8546
Alexander Belopolsky40018472011-02-26 01:02:56 +00008547PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008548_PyUnicode_TranslateCharmap(PyObject *input,
8549 PyObject *mapping,
8550 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008552 /* input object */
8553 char *idata;
8554 Py_ssize_t size, i;
8555 int kind;
8556 /* output buffer */
8557 Py_UCS4 *output = NULL;
8558 Py_ssize_t osize;
8559 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008560 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008561 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008562 char *reason = "character maps to <undefined>";
8563 PyObject *errorHandler = NULL;
8564 PyObject *exc = NULL;
8565 /* the following variable is used for caching string comparisons
8566 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8567 * 3=ignore, 4=xmlcharrefreplace */
8568 int known_errorHandler = -1;
8569
Guido van Rossumd57fd912000-03-10 22:53:23 +00008570 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 PyErr_BadArgument();
8572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008574
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008575 if (PyUnicode_READY(input) == -1)
8576 return NULL;
8577 idata = (char*)PyUnicode_DATA(input);
8578 kind = PyUnicode_KIND(input);
8579 size = PyUnicode_GET_LENGTH(input);
8580 i = 0;
8581
8582 if (size == 0) {
8583 Py_INCREF(input);
8584 return input;
8585 }
8586
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008587 /* allocate enough for a simple 1:1 translation without
8588 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 osize = size;
8590 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8591 opos = 0;
8592 if (output == NULL) {
8593 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008597 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 /* try to encode it */
8599 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008600 if (charmaptranslate_output(input, i, mapping,
8601 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 Py_XDECREF(x);
8603 goto onError;
8604 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008605 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008606 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008608 else { /* untranslatable character */
8609 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8610 Py_ssize_t repsize;
8611 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008612 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008614 Py_ssize_t collstart = i;
8615 Py_ssize_t collend = i+1;
8616 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 while (collend < size) {
8620 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 goto onError;
8622 Py_XDECREF(x);
8623 if (x!=Py_None)
8624 break;
8625 ++collend;
8626 }
8627 /* cache callback name lookup
8628 * (if not done yet, i.e. it's the first error) */
8629 if (known_errorHandler==-1) {
8630 if ((errors==NULL) || (!strcmp(errors, "strict")))
8631 known_errorHandler = 1;
8632 else if (!strcmp(errors, "replace"))
8633 known_errorHandler = 2;
8634 else if (!strcmp(errors, "ignore"))
8635 known_errorHandler = 3;
8636 else if (!strcmp(errors, "xmlcharrefreplace"))
8637 known_errorHandler = 4;
8638 else
8639 known_errorHandler = 0;
8640 }
8641 switch (known_errorHandler) {
8642 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643 raise_translate_exception(&exc, input, collstart,
8644 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008645 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 case 2: /* replace */
8647 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648 for (coll = collstart; coll<collend; coll++)
8649 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 /* fall through */
8651 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008652 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 break;
8654 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008655 /* generate replacement (temporarily (mis)uses i) */
8656 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008657 char buffer[2+29+1+1];
8658 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8660 if (charmaptranslate_makespace(&output, &osize,
8661 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 goto onError;
8663 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 break;
8668 default:
8669 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008670 reason, input, &exc,
8671 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008672 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008674 if (PyUnicode_READY(repunicode) < 0) {
8675 Py_DECREF(repunicode);
8676 goto onError;
8677 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 repsize = PyUnicode_GET_LENGTH(repunicode);
8680 if (charmaptranslate_makespace(&output, &osize,
8681 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 Py_DECREF(repunicode);
8683 goto onError;
8684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008685 for (uni2 = 0; repsize-->0; ++uni2)
8686 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8687 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008689 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008690 }
8691 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008692 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8693 if (!res)
8694 goto onError;
8695 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008696 Py_XDECREF(exc);
8697 Py_XDECREF(errorHandler);
8698 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699
Benjamin Peterson29060642009-01-31 22:14:21 +00008700 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008702 Py_XDECREF(exc);
8703 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704 return NULL;
8705}
8706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707/* Deprecated. Use PyUnicode_Translate instead. */
8708PyObject *
8709PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8710 Py_ssize_t size,
8711 PyObject *mapping,
8712 const char *errors)
8713{
8714 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8715 if (!unicode)
8716 return NULL;
8717 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8718}
8719
Alexander Belopolsky40018472011-02-26 01:02:56 +00008720PyObject *
8721PyUnicode_Translate(PyObject *str,
8722 PyObject *mapping,
8723 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724{
8725 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008726
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727 str = PyUnicode_FromObject(str);
8728 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008730 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731 Py_DECREF(str);
8732 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008733
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008735 Py_XDECREF(str);
8736 return NULL;
8737}
Tim Petersced69f82003-09-16 20:30:58 +00008738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008739static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008740fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008741{
8742 /* No need to call PyUnicode_READY(self) because this function is only
8743 called as a callback from fixup() which does it already. */
8744 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8745 const int kind = PyUnicode_KIND(self);
8746 void *data = PyUnicode_DATA(self);
8747 Py_UCS4 maxchar = 0, ch, fixed;
8748 Py_ssize_t i;
8749
8750 for (i = 0; i < len; ++i) {
8751 ch = PyUnicode_READ(kind, data, i);
8752 fixed = 0;
8753 if (ch > 127) {
8754 if (Py_UNICODE_ISSPACE(ch))
8755 fixed = ' ';
8756 else {
8757 const int decimal = Py_UNICODE_TODECIMAL(ch);
8758 if (decimal >= 0)
8759 fixed = '0' + decimal;
8760 }
8761 if (fixed != 0) {
8762 if (fixed > maxchar)
8763 maxchar = fixed;
8764 PyUnicode_WRITE(kind, data, i, fixed);
8765 }
8766 else if (ch > maxchar)
8767 maxchar = ch;
8768 }
8769 else if (ch > maxchar)
8770 maxchar = ch;
8771 }
8772
8773 return maxchar;
8774}
8775
8776PyObject *
8777_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8778{
8779 if (!PyUnicode_Check(unicode)) {
8780 PyErr_BadInternalCall();
8781 return NULL;
8782 }
8783 if (PyUnicode_READY(unicode) == -1)
8784 return NULL;
8785 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8786 /* If the string is already ASCII, just return the same string */
8787 Py_INCREF(unicode);
8788 return unicode;
8789 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008790 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008791}
8792
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008793PyObject *
8794PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8795 Py_ssize_t length)
8796{
Victor Stinnerf0124502011-11-21 23:12:56 +01008797 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008798 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008799 Py_UCS4 maxchar;
8800 enum PyUnicode_Kind kind;
8801 void *data;
8802
8803 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008804 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008805 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008806 if (ch > 127) {
8807 int decimal = Py_UNICODE_TODECIMAL(ch);
8808 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008809 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008810 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008811 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008812 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008813
8814 /* Copy to a new string */
8815 decimal = PyUnicode_New(length, maxchar);
8816 if (decimal == NULL)
8817 return decimal;
8818 kind = PyUnicode_KIND(decimal);
8819 data = PyUnicode_DATA(decimal);
8820 /* Iterate over code points */
8821 for (i = 0; i < length; i++) {
8822 Py_UNICODE ch = s[i];
8823 if (ch > 127) {
8824 int decimal = Py_UNICODE_TODECIMAL(ch);
8825 if (decimal >= 0)
8826 ch = '0' + decimal;
8827 }
8828 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008829 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008830 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008831}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008832/* --- Decimal Encoder ---------------------------------------------------- */
8833
Alexander Belopolsky40018472011-02-26 01:02:56 +00008834int
8835PyUnicode_EncodeDecimal(Py_UNICODE *s,
8836 Py_ssize_t length,
8837 char *output,
8838 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008839{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008840 PyObject *errorHandler = NULL;
8841 PyObject *exc = NULL;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008842 PyObject *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008843 const char *encoding = "decimal";
8844 const char *reason = "invalid decimal Unicode string";
8845 /* the following variable is used for caching string comparisons
8846 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8847 int known_errorHandler = -1;
Victor Stinner42bf7752011-11-21 22:52:58 +01008848 Py_ssize_t i, j;
8849 enum PyUnicode_Kind kind;
8850 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008851
8852 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008853 PyErr_BadArgument();
8854 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008855 }
8856
Victor Stinner42bf7752011-11-21 22:52:58 +01008857 unicode = PyUnicode_FromUnicode(s, length);
8858 if (unicode == NULL)
8859 return -1;
8860
8861 if (PyUnicode_READY(unicode) < 0)
8862 goto onError;
8863 kind = PyUnicode_KIND(unicode);
8864 data = PyUnicode_DATA(unicode);
8865
Victor Stinnerb84d7232011-11-22 01:50:07 +01008866 for (i=0; i < length; ) {
Victor Stinner42bf7752011-11-21 22:52:58 +01008867 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008868 int decimal;
Victor Stinner42bf7752011-11-21 22:52:58 +01008869 Py_ssize_t startpos, endpos;
Tim Petersced69f82003-09-16 20:30:58 +00008870
Benjamin Peterson29060642009-01-31 22:14:21 +00008871 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008872 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008873 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008874 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008875 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008876 decimal = Py_UNICODE_TODECIMAL(ch);
8877 if (decimal >= 0) {
8878 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008879 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008880 continue;
8881 }
8882 if (0 < ch && ch < 256) {
8883 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008884 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008885 continue;
8886 }
8887 /* All other characters are considered unencodable */
Victor Stinner42bf7752011-11-21 22:52:58 +01008888 startpos = i;
8889 endpos = i+1;
8890 for (; endpos < length; endpos++) {
8891 ch = PyUnicode_READ(kind, data, endpos);
8892 if ((0 < ch && ch < 256) ||
Victor Stinnerb84d7232011-11-22 01:50:07 +01008893 Py_UNICODE_ISSPACE(ch) ||
8894 0 <= Py_UNICODE_TODECIMAL(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00008895 break;
8896 }
8897 /* cache callback name lookup
8898 * (if not done yet, i.e. it's the first error) */
8899 if (known_errorHandler==-1) {
8900 if ((errors==NULL) || (!strcmp(errors, "strict")))
8901 known_errorHandler = 1;
8902 else if (!strcmp(errors, "replace"))
8903 known_errorHandler = 2;
8904 else if (!strcmp(errors, "ignore"))
8905 known_errorHandler = 3;
8906 else if (!strcmp(errors, "xmlcharrefreplace"))
8907 known_errorHandler = 4;
8908 else
8909 known_errorHandler = 0;
8910 }
8911 switch (known_errorHandler) {
8912 case 1: /* strict */
Victor Stinner42bf7752011-11-21 22:52:58 +01008913 raise_encode_exception(&exc, encoding, unicode, startpos, endpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008914 goto onError;
8915 case 2: /* replace */
Victor Stinner42bf7752011-11-21 22:52:58 +01008916 for (j=startpos; j < endpos; j++)
Benjamin Peterson29060642009-01-31 22:14:21 +00008917 *output++ = '?';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008918 i = endpos;
8919 break;
Benjamin Peterson29060642009-01-31 22:14:21 +00008920 case 3: /* ignore */
Victor Stinner42bf7752011-11-21 22:52:58 +01008921 i = endpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008922 break;
8923 case 4: /* xmlcharrefreplace */
Victor Stinner42bf7752011-11-21 22:52:58 +01008924 /* generate replacement */
8925 for (j=startpos; j < endpos; j++) {
8926 ch = PyUnicode_READ(kind, data, i);
8927 output += sprintf(output, "&#%d;", (int)ch);
8928 i++;
8929 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008930 break;
8931 default:
Victor Stinner42bf7752011-11-21 22:52:58 +01008932 {
8933 PyObject *repunicode;
8934 Py_ssize_t repsize, newpos, k;
8935 enum PyUnicode_Kind repkind;
8936 void *repdata;
8937
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008939 encoding, reason, unicode, &exc,
Victor Stinner42bf7752011-11-21 22:52:58 +01008940 startpos, endpos, &newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008941 if (repunicode == NULL)
8942 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008943 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008944 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008945 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8946 Py_DECREF(repunicode);
8947 goto onError;
8948 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008949 if (PyUnicode_READY(repunicode) < 0) {
8950 Py_DECREF(repunicode);
8951 goto onError;
8952 }
8953 repkind = PyUnicode_KIND(repunicode);
8954 repdata = PyUnicode_DATA(repunicode);
8955
Benjamin Peterson29060642009-01-31 22:14:21 +00008956 /* generate replacement */
8957 repsize = PyUnicode_GET_SIZE(repunicode);
Victor Stinner42bf7752011-11-21 22:52:58 +01008958 for (k=0; k<repsize; k++) {
8959 ch = PyUnicode_READ(repkind, repdata, k);
Benjamin Peterson29060642009-01-31 22:14:21 +00008960 if (Py_UNICODE_ISSPACE(ch))
8961 *output++ = ' ';
8962 else {
8963 decimal = Py_UNICODE_TODECIMAL(ch);
8964 if (decimal >= 0)
8965 *output++ = '0' + decimal;
8966 else if (0 < ch && ch < 256)
8967 *output++ = (char)ch;
8968 else {
8969 Py_DECREF(repunicode);
8970 raise_encode_exception(&exc, encoding,
Victor Stinner42bf7752011-11-21 22:52:58 +01008971 unicode, startpos, endpos,
8972 reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 goto onError;
8974 }
8975 }
8976 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008977 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008978 Py_DECREF(repunicode);
8979 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008980 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008981 }
8982 /* 0-terminate the output string */
8983 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008984 Py_XDECREF(exc);
8985 Py_XDECREF(errorHandler);
Victor Stinner42bf7752011-11-21 22:52:58 +01008986 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008987 return 0;
8988
Benjamin Peterson29060642009-01-31 22:14:21 +00008989 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008990 Py_XDECREF(exc);
8991 Py_XDECREF(errorHandler);
Victor Stinner42bf7752011-11-21 22:52:58 +01008992 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008993 return -1;
8994}
8995
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996/* --- Helpers ------------------------------------------------------------ */
8997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008998static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008999any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009000 Py_ssize_t start,
9001 Py_ssize_t end)
9002{
9003 int kind1, kind2, kind;
9004 void *buf1, *buf2;
9005 Py_ssize_t len1, len2, result;
9006
9007 kind1 = PyUnicode_KIND(s1);
9008 kind2 = PyUnicode_KIND(s2);
9009 kind = kind1 > kind2 ? kind1 : kind2;
9010 buf1 = PyUnicode_DATA(s1);
9011 buf2 = PyUnicode_DATA(s2);
9012 if (kind1 != kind)
9013 buf1 = _PyUnicode_AsKind(s1, kind);
9014 if (!buf1)
9015 return -2;
9016 if (kind2 != kind)
9017 buf2 = _PyUnicode_AsKind(s2, kind);
9018 if (!buf2) {
9019 if (kind1 != kind) PyMem_Free(buf1);
9020 return -2;
9021 }
9022 len1 = PyUnicode_GET_LENGTH(s1);
9023 len2 = PyUnicode_GET_LENGTH(s2);
9024
Victor Stinner794d5672011-10-10 03:21:36 +02009025 if (direction > 0) {
9026 switch(kind) {
9027 case PyUnicode_1BYTE_KIND:
9028 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9029 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9030 else
9031 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9032 break;
9033 case PyUnicode_2BYTE_KIND:
9034 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9035 break;
9036 case PyUnicode_4BYTE_KIND:
9037 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9038 break;
9039 default:
9040 assert(0); result = -2;
9041 }
9042 }
9043 else {
9044 switch(kind) {
9045 case PyUnicode_1BYTE_KIND:
9046 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9047 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9048 else
9049 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9050 break;
9051 case PyUnicode_2BYTE_KIND:
9052 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9053 break;
9054 case PyUnicode_4BYTE_KIND:
9055 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9056 break;
9057 default:
9058 assert(0); result = -2;
9059 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060 }
9061
9062 if (kind1 != kind)
9063 PyMem_Free(buf1);
9064 if (kind2 != kind)
9065 PyMem_Free(buf2);
9066
9067 return result;
9068}
9069
9070Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009071_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009072 Py_ssize_t n_buffer,
9073 void *digits, Py_ssize_t n_digits,
9074 Py_ssize_t min_width,
9075 const char *grouping,
9076 const char *thousands_sep)
9077{
9078 switch(kind) {
9079 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009080 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9081 return _PyUnicode_ascii_InsertThousandsGrouping(
9082 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9083 min_width, grouping, thousands_sep);
9084 else
9085 return _PyUnicode_ucs1_InsertThousandsGrouping(
9086 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9087 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 case PyUnicode_2BYTE_KIND:
9089 return _PyUnicode_ucs2_InsertThousandsGrouping(
9090 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9091 min_width, grouping, thousands_sep);
9092 case PyUnicode_4BYTE_KIND:
9093 return _PyUnicode_ucs4_InsertThousandsGrouping(
9094 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9095 min_width, grouping, thousands_sep);
9096 }
9097 assert(0);
9098 return -1;
9099}
9100
9101
/* helper macro to fixup start/end slice values:
   - end is clipped to len; a negative end counts from the back and is
     clamped at 0
   - a negative start counts from the back and is clamped at 0
   start and end must be modifiable lvalues; len is evaluated more than
   once, so it must be free of side effects.
   The body is wrapped in do { } while (0) so the macro behaves as one
   statement (e.g. under an unbraced if); use it with a trailing ';'. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009116
Alexander Belopolsky40018472011-02-26 01:02:56 +00009117Py_ssize_t
9118PyUnicode_Count(PyObject *str,
9119 PyObject *substr,
9120 Py_ssize_t start,
9121 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009123 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009124 PyObject* str_obj;
9125 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126 int kind1, kind2, kind;
9127 void *buf1 = NULL, *buf2 = NULL;
9128 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009129
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009130 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009131 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009132 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009133 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009134 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009135 Py_DECREF(str_obj);
9136 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137 }
Tim Petersced69f82003-09-16 20:30:58 +00009138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009139 kind1 = PyUnicode_KIND(str_obj);
9140 kind2 = PyUnicode_KIND(sub_obj);
9141 kind = kind1 > kind2 ? kind1 : kind2;
9142 buf1 = PyUnicode_DATA(str_obj);
9143 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009144 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009145 if (!buf1)
9146 goto onError;
9147 buf2 = PyUnicode_DATA(sub_obj);
9148 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009149 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150 if (!buf2)
9151 goto onError;
9152 len1 = PyUnicode_GET_LENGTH(str_obj);
9153 len2 = PyUnicode_GET_LENGTH(sub_obj);
9154
9155 ADJUST_INDICES(start, end, len1);
9156 switch(kind) {
9157 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009158 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9159 result = asciilib_count(
9160 ((Py_UCS1*)buf1) + start, end - start,
9161 buf2, len2, PY_SSIZE_T_MAX
9162 );
9163 else
9164 result = ucs1lib_count(
9165 ((Py_UCS1*)buf1) + start, end - start,
9166 buf2, len2, PY_SSIZE_T_MAX
9167 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009168 break;
9169 case PyUnicode_2BYTE_KIND:
9170 result = ucs2lib_count(
9171 ((Py_UCS2*)buf1) + start, end - start,
9172 buf2, len2, PY_SSIZE_T_MAX
9173 );
9174 break;
9175 case PyUnicode_4BYTE_KIND:
9176 result = ucs4lib_count(
9177 ((Py_UCS4*)buf1) + start, end - start,
9178 buf2, len2, PY_SSIZE_T_MAX
9179 );
9180 break;
9181 default:
9182 assert(0); result = 0;
9183 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009184
9185 Py_DECREF(sub_obj);
9186 Py_DECREF(str_obj);
9187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009188 if (kind1 != kind)
9189 PyMem_Free(buf1);
9190 if (kind2 != kind)
9191 PyMem_Free(buf2);
9192
Guido van Rossumd57fd912000-03-10 22:53:23 +00009193 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009194 onError:
9195 Py_DECREF(sub_obj);
9196 Py_DECREF(str_obj);
9197 if (kind1 != kind && buf1)
9198 PyMem_Free(buf1);
9199 if (kind2 != kind && buf2)
9200 PyMem_Free(buf2);
9201 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009202}
9203
Alexander Belopolsky40018472011-02-26 01:02:56 +00009204Py_ssize_t
9205PyUnicode_Find(PyObject *str,
9206 PyObject *sub,
9207 Py_ssize_t start,
9208 Py_ssize_t end,
9209 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009210{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009211 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009212
Guido van Rossumd57fd912000-03-10 22:53:23 +00009213 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009214 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009215 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009216 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009217 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009218 Py_DECREF(str);
9219 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009220 }
Tim Petersced69f82003-09-16 20:30:58 +00009221
Victor Stinner794d5672011-10-10 03:21:36 +02009222 result = any_find_slice(direction,
9223 str, sub, start, end
9224 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009225
Guido van Rossumd57fd912000-03-10 22:53:23 +00009226 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009227 Py_DECREF(sub);
9228
Guido van Rossumd57fd912000-03-10 22:53:23 +00009229 return result;
9230}
9231
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009232Py_ssize_t
9233PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9234 Py_ssize_t start, Py_ssize_t end,
9235 int direction)
9236{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009238 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239 if (PyUnicode_READY(str) == -1)
9240 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009241 if (start < 0 || end < 0) {
9242 PyErr_SetString(PyExc_IndexError, "string index out of range");
9243 return -2;
9244 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245 if (end > PyUnicode_GET_LENGTH(str))
9246 end = PyUnicode_GET_LENGTH(str);
9247 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009248 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9249 kind, end-start, ch, direction);
9250 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009251 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009252 else
9253 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254}
9255
Alexander Belopolsky40018472011-02-26 01:02:56 +00009256static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009257tailmatch(PyObject *self,
9258 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009259 Py_ssize_t start,
9260 Py_ssize_t end,
9261 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009262{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263 int kind_self;
9264 int kind_sub;
9265 void *data_self;
9266 void *data_sub;
9267 Py_ssize_t offset;
9268 Py_ssize_t i;
9269 Py_ssize_t end_sub;
9270
9271 if (PyUnicode_READY(self) == -1 ||
9272 PyUnicode_READY(substring) == -1)
9273 return 0;
9274
9275 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009276 return 1;
9277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9279 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009280 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009281 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009282
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283 kind_self = PyUnicode_KIND(self);
9284 data_self = PyUnicode_DATA(self);
9285 kind_sub = PyUnicode_KIND(substring);
9286 data_sub = PyUnicode_DATA(substring);
9287 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9288
9289 if (direction > 0)
9290 offset = end;
9291 else
9292 offset = start;
9293
9294 if (PyUnicode_READ(kind_self, data_self, offset) ==
9295 PyUnicode_READ(kind_sub, data_sub, 0) &&
9296 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9297 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9298 /* If both are of the same kind, memcmp is sufficient */
9299 if (kind_self == kind_sub) {
9300 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009301 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009302 data_sub,
9303 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009304 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305 }
9306 /* otherwise we have to compare each character by first accesing it */
9307 else {
9308 /* We do not need to compare 0 and len(substring)-1 because
9309 the if statement above ensured already that they are equal
9310 when we end up here. */
9311 // TODO: honor direction and do a forward or backwards search
9312 for (i = 1; i < end_sub; ++i) {
9313 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9314 PyUnicode_READ(kind_sub, data_sub, i))
9315 return 0;
9316 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009317 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009319 }
9320
9321 return 0;
9322}
9323
Alexander Belopolsky40018472011-02-26 01:02:56 +00009324Py_ssize_t
9325PyUnicode_Tailmatch(PyObject *str,
9326 PyObject *substr,
9327 Py_ssize_t start,
9328 Py_ssize_t end,
9329 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009330{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009331 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009332
Guido van Rossumd57fd912000-03-10 22:53:23 +00009333 str = PyUnicode_FromObject(str);
9334 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009335 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009336 substr = PyUnicode_FromObject(substr);
9337 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009338 Py_DECREF(str);
9339 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009340 }
Tim Petersced69f82003-09-16 20:30:58 +00009341
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009342 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009343 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009344 Py_DECREF(str);
9345 Py_DECREF(substr);
9346 return result;
9347}
9348
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349/* Apply fixfct filter to the Unicode object self and return a
9350 reference to the modified object */
9351
Alexander Belopolsky40018472011-02-26 01:02:56 +00009352static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009353fixup(PyObject *self,
9354 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009355{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009356 PyObject *u;
9357 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009358
Victor Stinner87af4f22011-11-21 23:03:47 +01009359 u = PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009360 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009361 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009362 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009363
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009364 /* fix functions return the new maximum character in a string,
9365 if the kind of the resulting unicode object does not change,
9366 everything is fine. Otherwise we need to change the string kind
9367 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009368 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009369 if (maxchar_new == 0)
9370 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9371 else if (maxchar_new <= 127)
9372 maxchar_new = 127;
9373 else if (maxchar_new <= 255)
9374 maxchar_new = 255;
9375 else if (maxchar_new <= 65535)
9376 maxchar_new = 65535;
9377 else
9378 maxchar_new = 1114111; /* 0x10ffff */
9379
9380 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009381 /* fixfct should return TRUE if it modified the buffer. If
9382 FALSE, return a reference to the original buffer instead
9383 (to save space, not time) */
9384 Py_INCREF(self);
9385 Py_DECREF(u);
Victor Stinner7931d9a2011-11-04 00:22:48 +01009386 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009387 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009388 else if (maxchar_new == maxchar_old) {
9389 return u;
9390 }
9391 else {
9392 /* In case the maximum character changed, we need to
9393 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009394 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 if (v == NULL) {
9396 Py_DECREF(u);
9397 return NULL;
9398 }
9399 if (maxchar_new > maxchar_old) {
9400 /* If the maxchar increased so that the kind changed, not all
9401 characters are representable anymore and we need to fix the
9402 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009403 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009404 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009405 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9406 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009407 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009408 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009409 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410
9411 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009412 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009413 return v;
9414 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009415}
9416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009417static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009418fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009419{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 /* No need to call PyUnicode_READY(self) because this function is only
9421 called as a callback from fixup() which does it already. */
9422 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9423 const int kind = PyUnicode_KIND(self);
9424 void *data = PyUnicode_DATA(self);
9425 int touched = 0;
9426 Py_UCS4 maxchar = 0;
9427 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009429 for (i = 0; i < len; ++i) {
9430 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9431 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9432 if (up != ch) {
9433 if (up > maxchar)
9434 maxchar = up;
9435 PyUnicode_WRITE(kind, data, i, up);
9436 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009438 else if (ch > maxchar)
9439 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009440 }
9441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009442 if (touched)
9443 return maxchar;
9444 else
9445 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009446}
9447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009448static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009449fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009450{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9452 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9453 const int kind = PyUnicode_KIND(self);
9454 void *data = PyUnicode_DATA(self);
9455 int touched = 0;
9456 Py_UCS4 maxchar = 0;
9457 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 for(i = 0; i < len; ++i) {
9460 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9461 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9462 if (lo != ch) {
9463 if (lo > maxchar)
9464 maxchar = lo;
9465 PyUnicode_WRITE(kind, data, i, lo);
9466 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009467 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 else if (ch > maxchar)
9469 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470 }
9471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 if (touched)
9473 return maxchar;
9474 else
9475 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476}
9477
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009479fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9482 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9483 const int kind = PyUnicode_KIND(self);
9484 void *data = PyUnicode_DATA(self);
9485 int touched = 0;
9486 Py_UCS4 maxchar = 0;
9487 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489 for(i = 0; i < len; ++i) {
9490 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9491 Py_UCS4 nu = 0;
9492
9493 if (Py_UNICODE_ISUPPER(ch))
9494 nu = Py_UNICODE_TOLOWER(ch);
9495 else if (Py_UNICODE_ISLOWER(ch))
9496 nu = Py_UNICODE_TOUPPER(ch);
9497
9498 if (nu != 0) {
9499 if (nu > maxchar)
9500 maxchar = nu;
9501 PyUnicode_WRITE(kind, data, i, nu);
9502 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504 else if (ch > maxchar)
9505 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506 }
9507
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508 if (touched)
9509 return maxchar;
9510 else
9511 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512}
9513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009515fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009517 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9518 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9519 const int kind = PyUnicode_KIND(self);
9520 void *data = PyUnicode_DATA(self);
9521 int touched = 0;
9522 Py_UCS4 maxchar = 0;
9523 Py_ssize_t i = 0;
9524 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009525
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009526 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009527 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009528
9529 ch = PyUnicode_READ(kind, data, i);
9530 if (!Py_UNICODE_ISUPPER(ch)) {
9531 maxchar = Py_UNICODE_TOUPPER(ch);
9532 PyUnicode_WRITE(kind, data, i, maxchar);
9533 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009534 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009535 ++i;
9536 for(; i < len; ++i) {
9537 ch = PyUnicode_READ(kind, data, i);
9538 if (!Py_UNICODE_ISLOWER(ch)) {
9539 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9540 if (lo > maxchar)
9541 maxchar = lo;
9542 PyUnicode_WRITE(kind, data, i, lo);
9543 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009544 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 else if (ch > maxchar)
9546 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009547 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009548
9549 if (touched)
9550 return maxchar;
9551 else
9552 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009553}
9554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009556fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009557{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9559 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9560 const int kind = PyUnicode_KIND(self);
9561 void *data = PyUnicode_DATA(self);
9562 Py_UCS4 maxchar = 0;
9563 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009564 int previous_is_cased;
9565
9566 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567 if (len == 1) {
9568 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9569 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9570 if (ti != ch) {
9571 PyUnicode_WRITE(kind, data, i, ti);
9572 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009573 }
9574 else
9575 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009577 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009578 for(; i < len; ++i) {
9579 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9580 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009581
Benjamin Peterson29060642009-01-31 22:14:21 +00009582 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009583 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009584 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009585 nu = Py_UNICODE_TOTITLE(ch);
9586
9587 if (nu > maxchar)
9588 maxchar = nu;
9589 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009590
Benjamin Peterson29060642009-01-31 22:14:21 +00009591 if (Py_UNICODE_ISLOWER(ch) ||
9592 Py_UNICODE_ISUPPER(ch) ||
9593 Py_UNICODE_ISTITLE(ch))
9594 previous_is_cased = 1;
9595 else
9596 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009598 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009599}
9600
Tim Peters8ce9f162004-08-27 01:49:32 +00009601PyObject *
9602PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009603{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009604 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009605 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009606 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009607 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009608 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9609 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009610 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009611 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009612 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009613 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009614 int use_memcpy;
9615 unsigned char *res_data = NULL, *sep_data = NULL;
9616 PyObject *last_obj;
9617 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009618
Tim Peters05eba1f2004-08-27 21:32:02 +00009619 fseq = PySequence_Fast(seq, "");
9620 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009621 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009622 }
9623
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009624 /* NOTE: the following code can't call back into Python code,
9625 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009626 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009627
Tim Peters05eba1f2004-08-27 21:32:02 +00009628 seqlen = PySequence_Fast_GET_SIZE(fseq);
9629 /* If empty sequence, return u"". */
9630 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009631 Py_DECREF(fseq);
9632 Py_INCREF(unicode_empty);
9633 res = unicode_empty;
9634 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009635 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009636
Tim Peters05eba1f2004-08-27 21:32:02 +00009637 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009638 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009639 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009640 if (seqlen == 1) {
9641 if (PyUnicode_CheckExact(items[0])) {
9642 res = items[0];
9643 Py_INCREF(res);
9644 Py_DECREF(fseq);
9645 return res;
9646 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009647 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009648 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009649 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009650 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009651 /* Set up sep and seplen */
9652 if (separator == NULL) {
9653 /* fall back to a blank space separator */
9654 sep = PyUnicode_FromOrdinal(' ');
9655 if (!sep)
9656 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009657 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009658 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009659 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009660 else {
9661 if (!PyUnicode_Check(separator)) {
9662 PyErr_Format(PyExc_TypeError,
9663 "separator: expected str instance,"
9664 " %.80s found",
9665 Py_TYPE(separator)->tp_name);
9666 goto onError;
9667 }
9668 if (PyUnicode_READY(separator))
9669 goto onError;
9670 sep = separator;
9671 seplen = PyUnicode_GET_LENGTH(separator);
9672 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9673 /* inc refcount to keep this code path symmetric with the
9674 above case of a blank separator */
9675 Py_INCREF(sep);
9676 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009677 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009678 }
9679
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009680 /* There are at least two things to join, or else we have a subclass
9681 * of str in the sequence.
9682 * Do a pre-pass to figure out the total amount of space we'll
9683 * need (sz), and see whether all argument are strings.
9684 */
9685 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009686#ifdef Py_DEBUG
9687 use_memcpy = 0;
9688#else
9689 use_memcpy = 1;
9690#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009691 for (i = 0; i < seqlen; i++) {
9692 const Py_ssize_t old_sz = sz;
9693 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009694 if (!PyUnicode_Check(item)) {
9695 PyErr_Format(PyExc_TypeError,
9696 "sequence item %zd: expected str instance,"
9697 " %.80s found",
9698 i, Py_TYPE(item)->tp_name);
9699 goto onError;
9700 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009701 if (PyUnicode_READY(item) == -1)
9702 goto onError;
9703 sz += PyUnicode_GET_LENGTH(item);
9704 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009705 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009706 if (i != 0)
9707 sz += seplen;
9708 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9709 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009710 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009711 goto onError;
9712 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009713 if (use_memcpy && last_obj != NULL) {
9714 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9715 use_memcpy = 0;
9716 }
9717 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009718 }
Tim Petersced69f82003-09-16 20:30:58 +00009719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009721 if (res == NULL)
9722 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009723
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009724 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009725#ifdef Py_DEBUG
9726 use_memcpy = 0;
9727#else
9728 if (use_memcpy) {
9729 res_data = PyUnicode_1BYTE_DATA(res);
9730 kind = PyUnicode_KIND(res);
9731 if (seplen != 0)
9732 sep_data = PyUnicode_1BYTE_DATA(sep);
9733 }
9734#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009736 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009737 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009738 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009739 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009740 if (use_memcpy) {
9741 Py_MEMCPY(res_data,
9742 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009743 kind * seplen);
9744 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009745 }
9746 else {
9747 copy_characters(res, res_offset, sep, 0, seplen);
9748 res_offset += seplen;
9749 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009750 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009751 itemlen = PyUnicode_GET_LENGTH(item);
9752 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009753 if (use_memcpy) {
9754 Py_MEMCPY(res_data,
9755 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009756 kind * itemlen);
9757 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009758 }
9759 else {
9760 copy_characters(res, res_offset, item, 0, itemlen);
9761 res_offset += itemlen;
9762 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009763 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009764 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009765 if (use_memcpy)
9766 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009767 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009768 else
9769 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009770
Tim Peters05eba1f2004-08-27 21:32:02 +00009771 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009772 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009773 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009774 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009775
Benjamin Peterson29060642009-01-31 22:14:21 +00009776 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009777 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009779 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009780 return NULL;
9781}
9782
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive characters of the buffer `data`, beginning at character
   index `start`.  `kind` selects the character width: 1-byte buffers are
   filled with memset(), 2-byte buffers as Py_UCS2, and every other kind
   as Py_UCS4.  `kind` must not be PyUnicode_WCHAR_KIND. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char *to_ = (unsigned char *)(data) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 *to_ = (Py_UCS2 *)(data) + (start); \
            for (i_ = 0; i_ < (length); ++i_) \
                to_[i_] = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 *to_ = (Py_UCS4 *)(data) + (start); \
            for (i_ = 0; i_ < (length); ++i_) \
                to_[i_] = (value); \
            break; \
        } \
        } \
    } while (0)
9805
Victor Stinner9310abb2011-10-05 00:59:23 +02009806static PyObject *
9807pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009808 Py_ssize_t left,
9809 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009810 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009812 PyObject *u;
9813 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009814 int kind;
9815 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009816
9817 if (left < 0)
9818 left = 0;
9819 if (right < 0)
9820 right = 0;
9821
Tim Peters7a29bd52001-09-12 03:03:31 +00009822 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009823 Py_INCREF(self);
9824 return self;
9825 }
9826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009827 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9828 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009829 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9830 return NULL;
9831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009832 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9833 if (fill > maxchar)
9834 maxchar = fill;
9835 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009836 if (!u)
9837 return NULL;
9838
9839 kind = PyUnicode_KIND(u);
9840 data = PyUnicode_DATA(u);
9841 if (left)
9842 FILL(kind, data, fill, 0, left);
9843 if (right)
9844 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009845 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009846 assert(_PyUnicode_CheckConsistency(u, 1));
9847 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009848}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009849#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009850
Alexander Belopolsky40018472011-02-26 01:02:56 +00009851PyObject *
9852PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009853{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009854 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009855
9856 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009858 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009860 switch(PyUnicode_KIND(string)) {
9861 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009862 if (PyUnicode_IS_ASCII(string))
9863 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009864 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009865 PyUnicode_GET_LENGTH(string), keepends);
9866 else
9867 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009868 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009869 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 break;
9871 case PyUnicode_2BYTE_KIND:
9872 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009873 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009874 PyUnicode_GET_LENGTH(string), keepends);
9875 break;
9876 case PyUnicode_4BYTE_KIND:
9877 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009878 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009879 PyUnicode_GET_LENGTH(string), keepends);
9880 break;
9881 default:
9882 assert(0);
9883 list = 0;
9884 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009885 Py_DECREF(string);
9886 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887}
9888
Alexander Belopolsky40018472011-02-26 01:02:56 +00009889static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009890split(PyObject *self,
9891 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009892 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009893{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009894 int kind1, kind2, kind;
9895 void *buf1, *buf2;
9896 Py_ssize_t len1, len2;
9897 PyObject* out;
9898
Guido van Rossumd57fd912000-03-10 22:53:23 +00009899 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009900 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009902 if (PyUnicode_READY(self) == -1)
9903 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009905 if (substring == NULL)
9906 switch(PyUnicode_KIND(self)) {
9907 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009908 if (PyUnicode_IS_ASCII(self))
9909 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009910 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009911 PyUnicode_GET_LENGTH(self), maxcount
9912 );
9913 else
9914 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009915 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009916 PyUnicode_GET_LENGTH(self), maxcount
9917 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009918 case PyUnicode_2BYTE_KIND:
9919 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009920 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 PyUnicode_GET_LENGTH(self), maxcount
9922 );
9923 case PyUnicode_4BYTE_KIND:
9924 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009925 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009926 PyUnicode_GET_LENGTH(self), maxcount
9927 );
9928 default:
9929 assert(0);
9930 return NULL;
9931 }
9932
9933 if (PyUnicode_READY(substring) == -1)
9934 return NULL;
9935
9936 kind1 = PyUnicode_KIND(self);
9937 kind2 = PyUnicode_KIND(substring);
9938 kind = kind1 > kind2 ? kind1 : kind2;
9939 buf1 = PyUnicode_DATA(self);
9940 buf2 = PyUnicode_DATA(substring);
9941 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009942 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009943 if (!buf1)
9944 return NULL;
9945 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009946 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009947 if (!buf2) {
9948 if (kind1 != kind) PyMem_Free(buf1);
9949 return NULL;
9950 }
9951 len1 = PyUnicode_GET_LENGTH(self);
9952 len2 = PyUnicode_GET_LENGTH(substring);
9953
9954 switch(kind) {
9955 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009956 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9957 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009958 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009959 else
9960 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009961 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009962 break;
9963 case PyUnicode_2BYTE_KIND:
9964 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009965 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009966 break;
9967 case PyUnicode_4BYTE_KIND:
9968 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009969 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009970 break;
9971 default:
9972 out = NULL;
9973 }
9974 if (kind1 != kind)
9975 PyMem_Free(buf1);
9976 if (kind2 != kind)
9977 PyMem_Free(buf2);
9978 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009979}
9980
Alexander Belopolsky40018472011-02-26 01:02:56 +00009981static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009982rsplit(PyObject *self,
9983 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009984 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009985{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986 int kind1, kind2, kind;
9987 void *buf1, *buf2;
9988 Py_ssize_t len1, len2;
9989 PyObject* out;
9990
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009991 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009992 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009993
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009994 if (PyUnicode_READY(self) == -1)
9995 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009997 if (substring == NULL)
9998 switch(PyUnicode_KIND(self)) {
9999 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010000 if (PyUnicode_IS_ASCII(self))
10001 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010002 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010003 PyUnicode_GET_LENGTH(self), maxcount
10004 );
10005 else
10006 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010007 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010008 PyUnicode_GET_LENGTH(self), maxcount
10009 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010010 case PyUnicode_2BYTE_KIND:
10011 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010012 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 PyUnicode_GET_LENGTH(self), maxcount
10014 );
10015 case PyUnicode_4BYTE_KIND:
10016 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010017 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018 PyUnicode_GET_LENGTH(self), maxcount
10019 );
10020 default:
10021 assert(0);
10022 return NULL;
10023 }
10024
10025 if (PyUnicode_READY(substring) == -1)
10026 return NULL;
10027
10028 kind1 = PyUnicode_KIND(self);
10029 kind2 = PyUnicode_KIND(substring);
10030 kind = kind1 > kind2 ? kind1 : kind2;
10031 buf1 = PyUnicode_DATA(self);
10032 buf2 = PyUnicode_DATA(substring);
10033 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010034 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035 if (!buf1)
10036 return NULL;
10037 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010038 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010039 if (!buf2) {
10040 if (kind1 != kind) PyMem_Free(buf1);
10041 return NULL;
10042 }
10043 len1 = PyUnicode_GET_LENGTH(self);
10044 len2 = PyUnicode_GET_LENGTH(substring);
10045
10046 switch(kind) {
10047 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010048 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10049 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010050 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010051 else
10052 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010053 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010054 break;
10055 case PyUnicode_2BYTE_KIND:
10056 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010057 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010058 break;
10059 case PyUnicode_4BYTE_KIND:
10060 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010061 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010062 break;
10063 default:
10064 out = NULL;
10065 }
10066 if (kind1 != kind)
10067 PyMem_Free(buf1);
10068 if (kind2 != kind)
10069 PyMem_Free(buf2);
10070 return out;
10071}
10072
10073static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010074anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10075 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010076{
10077 switch(kind) {
10078 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010079 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10080 return asciilib_find(buf1, len1, buf2, len2, offset);
10081 else
10082 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010083 case PyUnicode_2BYTE_KIND:
10084 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10085 case PyUnicode_4BYTE_KIND:
10086 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10087 }
10088 assert(0);
10089 return -1;
10090}
10091
10092static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010093anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10094 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095{
10096 switch(kind) {
10097 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010098 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10099 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10100 else
10101 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010102 case PyUnicode_2BYTE_KIND:
10103 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10104 case PyUnicode_4BYTE_KIND:
10105 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10106 }
10107 assert(0);
10108 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010109}
10110
Alexander Belopolsky40018472011-02-26 01:02:56 +000010111static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112replace(PyObject *self, PyObject *str1,
10113 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010114{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 PyObject *u;
10116 char *sbuf = PyUnicode_DATA(self);
10117 char *buf1 = PyUnicode_DATA(str1);
10118 char *buf2 = PyUnicode_DATA(str2);
10119 int srelease = 0, release1 = 0, release2 = 0;
10120 int skind = PyUnicode_KIND(self);
10121 int kind1 = PyUnicode_KIND(str1);
10122 int kind2 = PyUnicode_KIND(str2);
10123 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10124 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10125 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010126 int mayshrink;
10127 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010128
10129 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010130 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010132 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010133
Victor Stinner59de0ee2011-10-07 10:01:28 +020010134 if (str1 == str2)
10135 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 if (skind < kind1)
10137 /* substring too wide to be present */
10138 goto nothing;
10139
Victor Stinner49a0a212011-10-12 23:46:10 +020010140 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10141 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10142 /* Replacing str1 with str2 may cause a maxchar reduction in the
10143 result string. */
10144 mayshrink = (maxchar_str2 < maxchar);
10145 maxchar = Py_MAX(maxchar, maxchar_str2);
10146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010148 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010149 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010151 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010153 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010154 Py_UCS4 u1, u2;
10155 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010157 if (findchar(sbuf, PyUnicode_KIND(self),
10158 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010159 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010162 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010164 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 rkind = PyUnicode_KIND(u);
10166 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10167 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010168 if (--maxcount < 0)
10169 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010171 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010172 }
10173 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 int rkind = skind;
10175 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010176
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 if (kind1 < rkind) {
10178 /* widen substring */
10179 buf1 = _PyUnicode_AsKind(str1, rkind);
10180 if (!buf1) goto error;
10181 release1 = 1;
10182 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010183 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010184 if (i < 0)
10185 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 if (rkind > kind2) {
10187 /* widen replacement */
10188 buf2 = _PyUnicode_AsKind(str2, rkind);
10189 if (!buf2) goto error;
10190 release2 = 1;
10191 }
10192 else if (rkind < kind2) {
10193 /* widen self and buf1 */
10194 rkind = kind2;
10195 if (release1) PyMem_Free(buf1);
10196 sbuf = _PyUnicode_AsKind(self, rkind);
10197 if (!sbuf) goto error;
10198 srelease = 1;
10199 buf1 = _PyUnicode_AsKind(str1, rkind);
10200 if (!buf1) goto error;
10201 release1 = 1;
10202 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010203 u = PyUnicode_New(slen, maxchar);
10204 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010206 assert(PyUnicode_KIND(u) == rkind);
10207 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010208
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010209 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010210 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010211 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010213 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010215
10216 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010217 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010218 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010219 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010220 if (i == -1)
10221 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010222 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010224 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010227 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010228 }
10229 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 Py_ssize_t n, i, j, ires;
10231 Py_ssize_t product, new_size;
10232 int rkind = skind;
10233 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010236 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 buf1 = _PyUnicode_AsKind(str1, rkind);
10238 if (!buf1) goto error;
10239 release1 = 1;
10240 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010241 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010242 if (n == 0)
10243 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010245 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 buf2 = _PyUnicode_AsKind(str2, rkind);
10247 if (!buf2) goto error;
10248 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010249 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010251 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 rkind = kind2;
10253 sbuf = _PyUnicode_AsKind(self, rkind);
10254 if (!sbuf) goto error;
10255 srelease = 1;
10256 if (release1) PyMem_Free(buf1);
10257 buf1 = _PyUnicode_AsKind(str1, rkind);
10258 if (!buf1) goto error;
10259 release1 = 1;
10260 }
10261 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10262 PyUnicode_GET_LENGTH(str1))); */
10263 product = n * (len2-len1);
10264 if ((product / (len2-len1)) != n) {
10265 PyErr_SetString(PyExc_OverflowError,
10266 "replace string is too long");
10267 goto error;
10268 }
10269 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010270 if (new_size == 0) {
10271 Py_INCREF(unicode_empty);
10272 u = unicode_empty;
10273 goto done;
10274 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10276 PyErr_SetString(PyExc_OverflowError,
10277 "replace string is too long");
10278 goto error;
10279 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010280 u = PyUnicode_New(new_size, maxchar);
10281 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010283 assert(PyUnicode_KIND(u) == rkind);
10284 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 ires = i = 0;
10286 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010287 while (n-- > 0) {
10288 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010289 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010290 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010291 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010292 if (j == -1)
10293 break;
10294 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010295 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010296 memcpy(res + rkind * ires,
10297 sbuf + rkind * i,
10298 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010300 }
10301 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010303 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010305 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010307 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010311 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010312 memcpy(res + rkind * ires,
10313 sbuf + rkind * i,
10314 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010315 }
10316 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010317 /* interleave */
10318 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010319 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010321 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010323 if (--n <= 0)
10324 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010325 memcpy(res + rkind * ires,
10326 sbuf + rkind * i,
10327 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 ires++;
10329 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010330 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010331 memcpy(res + rkind * ires,
10332 sbuf + rkind * i,
10333 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010334 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010335 }
10336
10337 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010338 unicode_adjust_maxchar(&u);
10339 if (u == NULL)
10340 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010341 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010342
10343 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 if (srelease)
10345 PyMem_FREE(sbuf);
10346 if (release1)
10347 PyMem_FREE(buf1);
10348 if (release2)
10349 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010350 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010352
Benjamin Peterson29060642009-01-31 22:14:21 +000010353 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010354 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 if (srelease)
10356 PyMem_FREE(sbuf);
10357 if (release1)
10358 PyMem_FREE(buf1);
10359 if (release2)
10360 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010361 if (PyUnicode_CheckExact(self)) {
10362 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010363 return self;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010364 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010365 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 error:
10367 if (srelease && sbuf)
10368 PyMem_FREE(sbuf);
10369 if (release1 && buf1)
10370 PyMem_FREE(buf1);
10371 if (release2 && buf2)
10372 PyMem_FREE(buf2);
10373 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374}
10375
10376/* --- Unicode Object Methods --------------------------------------------- */
10377
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010378PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010379 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010380\n\
10381Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010382characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010383
10384static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010385unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010387 return fixup(self, fixtitle);
10388}
10389
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010390PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010391 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392\n\
10393Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010394have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010395
10396static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010397unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010398{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399 return fixup(self, fixcapitalize);
10400}
10401
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace, run every
   piece through fixup(..., fixcapitalize), then join the pieces again with
   PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *previous = PyList_GET_ITEM(words, pos);
        PyObject *capitalized = fixup(previous, fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(previous);
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10439
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010440/* Argument converter. Coerces to a single unicode character */
10441
10442static int
10443convert_uc(PyObject *obj, void *addr)
10444{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010446 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010447
Benjamin Peterson14339b62009-01-31 16:36:08 +000010448 uniobj = PyUnicode_FromObject(obj);
10449 if (uniobj == NULL) {
10450 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010451 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010452 return 0;
10453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010455 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010456 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010457 Py_DECREF(uniobj);
10458 return 0;
10459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010461 Py_DECREF(uniobj);
10462 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010463}
10464
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010465PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010466 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010467\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010468Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010469done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010470
10471static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010472unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010473{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010474 Py_ssize_t marg, left;
10475 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 Py_UCS4 fillchar = ' ';
10477
Victor Stinnere9a29352011-10-01 02:14:59 +020010478 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010479 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010480
Victor Stinnere9a29352011-10-01 02:14:59 +020010481 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010482 return NULL;
10483
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010485 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010486 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487 }
10488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010490 left = marg / 2 + (marg & width & 1);
10491
Victor Stinner9310abb2011-10-05 00:59:23 +020010492 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010493}
10494
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495/* This function assumes that str1 and str2 are readied by the caller. */
10496
Marc-André Lemburge5034372000-08-08 08:04:29 +000010497static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010498unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010499{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 int kind1, kind2;
10501 void *data1, *data2;
10502 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 kind1 = PyUnicode_KIND(str1);
10505 kind2 = PyUnicode_KIND(str2);
10506 data1 = PyUnicode_DATA(str1);
10507 data2 = PyUnicode_DATA(str2);
10508 len1 = PyUnicode_GET_LENGTH(str1);
10509 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 for (i = 0; i < len1 && i < len2; ++i) {
10512 Py_UCS4 c1, c2;
10513 c1 = PyUnicode_READ(kind1, data1, i);
10514 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010515
10516 if (c1 != c2)
10517 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010518 }
10519
10520 return (len1 < len2) ? -1 : (len1 != len2);
10521}
10522
Alexander Belopolsky40018472011-02-26 01:02:56 +000010523int
10524PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10527 if (PyUnicode_READY(left) == -1 ||
10528 PyUnicode_READY(right) == -1)
10529 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010530 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010532 PyErr_Format(PyExc_TypeError,
10533 "Can't compare %.100s and %.100s",
10534 left->ob_type->tp_name,
10535 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010536 return -1;
10537}
10538
Martin v. Löwis5b222132007-06-10 09:51:05 +000010539int
10540PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10541{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 Py_ssize_t i;
10543 int kind;
10544 void *data;
10545 Py_UCS4 chr;
10546
Victor Stinner910337b2011-10-03 03:20:16 +020010547 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 if (PyUnicode_READY(uni) == -1)
10549 return -1;
10550 kind = PyUnicode_KIND(uni);
10551 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010552 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10554 if (chr != str[i])
10555 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010556 /* This check keeps Python strings that end in '\0' from comparing equal
10557 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010559 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010560 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010561 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010562 return 0;
10563}
10564
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010565
Benjamin Peterson29060642009-01-31 22:14:21 +000010566#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010567 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010568
Alexander Belopolsky40018472011-02-26 01:02:56 +000010569PyObject *
10570PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010571{
10572 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010573
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010574 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10575 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 if (PyUnicode_READY(left) == -1 ||
10577 PyUnicode_READY(right) == -1)
10578 return NULL;
10579 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10580 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010581 if (op == Py_EQ) {
10582 Py_INCREF(Py_False);
10583 return Py_False;
10584 }
10585 if (op == Py_NE) {
10586 Py_INCREF(Py_True);
10587 return Py_True;
10588 }
10589 }
10590 if (left == right)
10591 result = 0;
10592 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010593 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010594
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010595 /* Convert the return value to a Boolean */
10596 switch (op) {
10597 case Py_EQ:
10598 v = TEST_COND(result == 0);
10599 break;
10600 case Py_NE:
10601 v = TEST_COND(result != 0);
10602 break;
10603 case Py_LE:
10604 v = TEST_COND(result <= 0);
10605 break;
10606 case Py_GE:
10607 v = TEST_COND(result >= 0);
10608 break;
10609 case Py_LT:
10610 v = TEST_COND(result == -1);
10611 break;
10612 case Py_GT:
10613 v = TEST_COND(result == 1);
10614 break;
10615 default:
10616 PyErr_BadArgument();
10617 return NULL;
10618 }
10619 Py_INCREF(v);
10620 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010621 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010622
Brian Curtindfc80e32011-08-10 20:28:54 -050010623 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010624}
10625
Alexander Belopolsky40018472011-02-26 01:02:56 +000010626int
10627PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010628{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010629 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 int kind1, kind2, kind;
10631 void *buf1, *buf2;
10632 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010633 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010634
10635 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010636 sub = PyUnicode_FromObject(element);
10637 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010638 PyErr_Format(PyExc_TypeError,
10639 "'in <string>' requires string as left operand, not %s",
10640 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010641 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010642 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643 if (PyUnicode_READY(sub) == -1)
10644 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010645
Thomas Wouters477c8d52006-05-27 19:21:47 +000010646 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010647 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010648 Py_DECREF(sub);
10649 return -1;
10650 }
10651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 kind1 = PyUnicode_KIND(str);
10653 kind2 = PyUnicode_KIND(sub);
10654 kind = kind1 > kind2 ? kind1 : kind2;
10655 buf1 = PyUnicode_DATA(str);
10656 buf2 = PyUnicode_DATA(sub);
10657 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010658 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 if (!buf1) {
10660 Py_DECREF(sub);
10661 return -1;
10662 }
10663 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010664 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 if (!buf2) {
10666 Py_DECREF(sub);
10667 if (kind1 != kind) PyMem_Free(buf1);
10668 return -1;
10669 }
10670 len1 = PyUnicode_GET_LENGTH(str);
10671 len2 = PyUnicode_GET_LENGTH(sub);
10672
10673 switch(kind) {
10674 case PyUnicode_1BYTE_KIND:
10675 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10676 break;
10677 case PyUnicode_2BYTE_KIND:
10678 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10679 break;
10680 case PyUnicode_4BYTE_KIND:
10681 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10682 break;
10683 default:
10684 result = -1;
10685 assert(0);
10686 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010687
10688 Py_DECREF(str);
10689 Py_DECREF(sub);
10690
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010691 if (kind1 != kind)
10692 PyMem_Free(buf1);
10693 if (kind2 != kind)
10694 PyMem_Free(buf2);
10695
Guido van Rossum403d68b2000-03-13 15:55:09 +000010696 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010697}
10698
Guido van Rossumd57fd912000-03-10 22:53:23 +000010699/* Concat to string or Unicode object giving a new Unicode object. */
10700
Alexander Belopolsky40018472011-02-26 01:02:56 +000010701PyObject *
10702PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010704 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010705 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706
10707 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010708 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010709 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010710 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010711 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714
10715 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010716 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010717 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010718 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010720 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010721 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010722 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723 }
10724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010725 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010726 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10727 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 w = PyUnicode_New(
10731 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10732 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010734 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010735 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10736 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737 Py_DECREF(u);
10738 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010739 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741
Benjamin Peterson29060642009-01-31 22:14:21 +000010742 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743 Py_XDECREF(u);
10744 Py_XDECREF(v);
10745 return NULL;
10746}
10747
Victor Stinnerb0923652011-10-04 01:17:31 +020010748static void
10749unicode_append_inplace(PyObject **p_left, PyObject *right)
10750{
10751 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010752
10753 assert(PyUnicode_IS_READY(*p_left));
10754 assert(PyUnicode_IS_READY(right));
10755
10756 left_len = PyUnicode_GET_LENGTH(*p_left);
10757 right_len = PyUnicode_GET_LENGTH(right);
10758 if (left_len > PY_SSIZE_T_MAX - right_len) {
10759 PyErr_SetString(PyExc_OverflowError,
10760 "strings are too large to concat");
10761 goto error;
10762 }
10763 new_len = left_len + right_len;
10764
10765 /* Now we own the last reference to 'left', so we can resize it
10766 * in-place.
10767 */
10768 if (unicode_resize(p_left, new_len) != 0) {
10769 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10770 * deallocated so it cannot be put back into
10771 * 'variable'. The MemoryError is raised when there
10772 * is no value in 'variable', which might (very
10773 * remotely) be a cause of incompatibilities.
10774 */
10775 goto error;
10776 }
10777 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010778 copy_characters(*p_left, left_len, right, 0, right_len);
10779 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010780 return;
10781
10782error:
10783 Py_DECREF(*p_left);
10784 *p_left = NULL;
10785}
10786
Walter Dörwald1ab83302007-05-18 17:15:44 +000010787void
Victor Stinner23e56682011-10-03 03:54:37 +020010788PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010789{
Victor Stinner23e56682011-10-03 03:54:37 +020010790 PyObject *left, *res;
10791
10792 if (p_left == NULL) {
10793 if (!PyErr_Occurred())
10794 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010795 return;
10796 }
Victor Stinner23e56682011-10-03 03:54:37 +020010797 left = *p_left;
10798 if (right == NULL || !PyUnicode_Check(left)) {
10799 if (!PyErr_Occurred())
10800 PyErr_BadInternalCall();
10801 goto error;
10802 }
10803
Victor Stinnere1335c72011-10-04 20:53:03 +020010804 if (PyUnicode_READY(left))
10805 goto error;
10806 if (PyUnicode_READY(right))
10807 goto error;
10808
Victor Stinner23e56682011-10-03 03:54:37 +020010809 if (PyUnicode_CheckExact(left) && left != unicode_empty
10810 && PyUnicode_CheckExact(right) && right != unicode_empty
10811 && unicode_resizable(left)
10812 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10813 || _PyUnicode_WSTR(left) != NULL))
10814 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010815 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10816 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010817 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010818 not so different than duplicating the string. */
10819 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010820 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010821 unicode_append_inplace(p_left, right);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010822 assert(p_left == NULL || _PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010823 return;
10824 }
10825 }
10826
10827 res = PyUnicode_Concat(left, right);
10828 if (res == NULL)
10829 goto error;
10830 Py_DECREF(left);
10831 *p_left = res;
10832 return;
10833
10834error:
10835 Py_DECREF(*p_left);
10836 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010837}
10838
10839void
10840PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10841{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010842 PyUnicode_Append(pleft, right);
10843 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010844}
10845
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010846PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010847 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010848\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010849Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010850string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010851interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852
10853static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010854unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010856 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010857 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010858 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010859 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010860 int kind1, kind2, kind;
10861 void *buf1, *buf2;
10862 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010863
Jesus Ceaac451502011-04-20 17:09:23 +020010864 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10865 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010866 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010867
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010868 kind1 = PyUnicode_KIND(self);
10869 kind2 = PyUnicode_KIND(substring);
10870 kind = kind1 > kind2 ? kind1 : kind2;
10871 buf1 = PyUnicode_DATA(self);
10872 buf2 = PyUnicode_DATA(substring);
10873 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010874 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010875 if (!buf1) {
10876 Py_DECREF(substring);
10877 return NULL;
10878 }
10879 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010880 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 if (!buf2) {
10882 Py_DECREF(substring);
10883 if (kind1 != kind) PyMem_Free(buf1);
10884 return NULL;
10885 }
10886 len1 = PyUnicode_GET_LENGTH(self);
10887 len2 = PyUnicode_GET_LENGTH(substring);
10888
10889 ADJUST_INDICES(start, end, len1);
10890 switch(kind) {
10891 case PyUnicode_1BYTE_KIND:
10892 iresult = ucs1lib_count(
10893 ((Py_UCS1*)buf1) + start, end - start,
10894 buf2, len2, PY_SSIZE_T_MAX
10895 );
10896 break;
10897 case PyUnicode_2BYTE_KIND:
10898 iresult = ucs2lib_count(
10899 ((Py_UCS2*)buf1) + start, end - start,
10900 buf2, len2, PY_SSIZE_T_MAX
10901 );
10902 break;
10903 case PyUnicode_4BYTE_KIND:
10904 iresult = ucs4lib_count(
10905 ((Py_UCS4*)buf1) + start, end - start,
10906 buf2, len2, PY_SSIZE_T_MAX
10907 );
10908 break;
10909 default:
10910 assert(0); iresult = 0;
10911 }
10912
10913 result = PyLong_FromSsize_t(iresult);
10914
10915 if (kind1 != kind)
10916 PyMem_Free(buf1);
10917 if (kind2 != kind)
10918 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010919
10920 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010921
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922 return result;
10923}
10924
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010925PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010926 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010928Encode S using the codec registered for encoding. Default encoding\n\
10929is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010930handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010931a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10932'xmlcharrefreplace' as well as any other name registered with\n\
10933codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010934
10935static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010936unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010938 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939 char *encoding = NULL;
10940 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010941
Benjamin Peterson308d6372009-09-18 21:42:35 +000010942 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10943 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010945 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010946}
10947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010948PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010949 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950\n\
10951Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010952If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953
10954static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010955unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010957 Py_ssize_t i, j, line_pos, src_len, incr;
10958 Py_UCS4 ch;
10959 PyObject *u;
10960 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010962 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010963 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964
10965 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010966 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967
Antoine Pitrou22425222011-10-04 19:10:51 +020010968 if (PyUnicode_READY(self) == -1)
10969 return NULL;
10970
Thomas Wouters7e474022000-07-16 12:04:32 +000010971 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010972 src_len = PyUnicode_GET_LENGTH(self);
10973 i = j = line_pos = 0;
10974 kind = PyUnicode_KIND(self);
10975 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010976 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010977 for (; i < src_len; i++) {
10978 ch = PyUnicode_READ(kind, src_data, i);
10979 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010980 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010981 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010982 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010983 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010984 goto overflow;
10985 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010986 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010987 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010988 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010990 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010991 goto overflow;
10992 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010994 if (ch == '\n' || ch == '\r')
10995 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010997 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010998 if (!found && PyUnicode_CheckExact(self)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +010010999 Py_INCREF(self);
11000 return self;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011001 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011002
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011004 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005 if (!u)
11006 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011007 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008
Antoine Pitroue71d5742011-10-04 15:55:09 +020011009 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010
Antoine Pitroue71d5742011-10-04 15:55:09 +020011011 for (; i < src_len; i++) {
11012 ch = PyUnicode_READ(kind, src_data, i);
11013 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011014 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011015 incr = tabsize - (line_pos % tabsize);
11016 line_pos += incr;
11017 while (incr--) {
11018 PyUnicode_WRITE(kind, dest_data, j, ' ');
11019 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011020 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011021 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011022 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011023 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011024 line_pos++;
11025 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011026 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011027 if (ch == '\n' || ch == '\r')
11028 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011030 }
11031 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011032 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011033
Antoine Pitroue71d5742011-10-04 15:55:09 +020011034 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011035 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11036 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037}
11038
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011039PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011040 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041\n\
11042Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011043such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011044arguments start and end are interpreted as in slice notation.\n\
11045\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011046Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011047
11048static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011049unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011051 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011052 Py_ssize_t start;
11053 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011054 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055
Jesus Ceaac451502011-04-20 17:09:23 +020011056 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11057 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011060 if (PyUnicode_READY(self) == -1)
11061 return NULL;
11062 if (PyUnicode_READY(substring) == -1)
11063 return NULL;
11064
Victor Stinner7931d9a2011-11-04 00:22:48 +010011065 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066
11067 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011069 if (result == -2)
11070 return NULL;
11071
Christian Heimes217cfd12007-12-02 14:31:20 +000011072 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073}
11074
11075static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011076unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011078 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11079 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011080 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011081 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082}
11083
Guido van Rossumc2504932007-09-18 19:42:40 +000011084/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011085 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011086static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011087unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088{
Guido van Rossumc2504932007-09-18 19:42:40 +000011089 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011090 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 if (_PyUnicode_HASH(self) != -1)
11093 return _PyUnicode_HASH(self);
11094 if (PyUnicode_READY(self) == -1)
11095 return -1;
11096 len = PyUnicode_GET_LENGTH(self);
11097
11098 /* The hash function as a macro, gets expanded three times below. */
11099#define HASH(P) \
11100 x = (Py_uhash_t)*P << 7; \
11101 while (--len >= 0) \
11102 x = (1000003*x) ^ (Py_uhash_t)*P++;
11103
11104 switch (PyUnicode_KIND(self)) {
11105 case PyUnicode_1BYTE_KIND: {
11106 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11107 HASH(c);
11108 break;
11109 }
11110 case PyUnicode_2BYTE_KIND: {
11111 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11112 HASH(s);
11113 break;
11114 }
11115 default: {
11116 Py_UCS4 *l;
11117 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11118 "Impossible switch case in unicode_hash");
11119 l = PyUnicode_4BYTE_DATA(self);
11120 HASH(l);
11121 break;
11122 }
11123 }
11124 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11125
Guido van Rossumc2504932007-09-18 19:42:40 +000011126 if (x == -1)
11127 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011128 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011129 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011130}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011131#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011133PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011134 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011136Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137
11138static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011139unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011141 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011142 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011143 Py_ssize_t start;
11144 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145
Jesus Ceaac451502011-04-20 17:09:23 +020011146 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11147 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011150 if (PyUnicode_READY(self) == -1)
11151 return NULL;
11152 if (PyUnicode_READY(substring) == -1)
11153 return NULL;
11154
Victor Stinner7931d9a2011-11-04 00:22:48 +010011155 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011156
11157 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159 if (result == -2)
11160 return NULL;
11161
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162 if (result < 0) {
11163 PyErr_SetString(PyExc_ValueError, "substring not found");
11164 return NULL;
11165 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011166
Christian Heimes217cfd12007-12-02 14:31:20 +000011167 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011168}
11169
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011170PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011171 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011173Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011174at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011175
11176static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011177unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011178{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 Py_ssize_t i, length;
11180 int kind;
11181 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182 int cased;
11183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011184 if (PyUnicode_READY(self) == -1)
11185 return NULL;
11186 length = PyUnicode_GET_LENGTH(self);
11187 kind = PyUnicode_KIND(self);
11188 data = PyUnicode_DATA(self);
11189
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011191 if (length == 1)
11192 return PyBool_FromLong(
11193 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011195 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011197 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011198
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011200 for (i = 0; i < length; i++) {
11201 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011202
Benjamin Peterson29060642009-01-31 22:14:21 +000011203 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11204 return PyBool_FromLong(0);
11205 else if (!cased && Py_UNICODE_ISLOWER(ch))
11206 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011208 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209}
11210
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011211PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011212 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011213\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011214Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011215at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011216
11217static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011218unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011220 Py_ssize_t i, length;
11221 int kind;
11222 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223 int cased;
11224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011225 if (PyUnicode_READY(self) == -1)
11226 return NULL;
11227 length = PyUnicode_GET_LENGTH(self);
11228 kind = PyUnicode_KIND(self);
11229 data = PyUnicode_DATA(self);
11230
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011232 if (length == 1)
11233 return PyBool_FromLong(
11234 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011236 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011238 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011239
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011241 for (i = 0; i < length; i++) {
11242 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011243
Benjamin Peterson29060642009-01-31 22:14:21 +000011244 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11245 return PyBool_FromLong(0);
11246 else if (!cased && Py_UNICODE_ISUPPER(ch))
11247 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011249 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250}
11251
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011252PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011253 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011255Return True if S is a titlecased string and there is at least one\n\
11256character in S, i.e. upper- and titlecase characters may only\n\
11257follow uncased characters and lowercase characters only cased ones.\n\
11258Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259
11260static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011261unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011263 Py_ssize_t i, length;
11264 int kind;
11265 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266 int cased, previous_is_cased;
11267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011268 if (PyUnicode_READY(self) == -1)
11269 return NULL;
11270 length = PyUnicode_GET_LENGTH(self);
11271 kind = PyUnicode_KIND(self);
11272 data = PyUnicode_DATA(self);
11273
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011275 if (length == 1) {
11276 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11277 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11278 (Py_UNICODE_ISUPPER(ch) != 0));
11279 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011281 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011282 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011283 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011284
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285 cased = 0;
11286 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011287 for (i = 0; i < length; i++) {
11288 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011289
Benjamin Peterson29060642009-01-31 22:14:21 +000011290 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11291 if (previous_is_cased)
11292 return PyBool_FromLong(0);
11293 previous_is_cased = 1;
11294 cased = 1;
11295 }
11296 else if (Py_UNICODE_ISLOWER(ch)) {
11297 if (!previous_is_cased)
11298 return PyBool_FromLong(0);
11299 previous_is_cased = 1;
11300 cased = 1;
11301 }
11302 else
11303 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011304 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011305 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306}
11307
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011308PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011309 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011310\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011311Return True if all characters in S are whitespace\n\
11312and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313
11314static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011315unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011317 Py_ssize_t i, length;
11318 int kind;
11319 void *data;
11320
11321 if (PyUnicode_READY(self) == -1)
11322 return NULL;
11323 length = PyUnicode_GET_LENGTH(self);
11324 kind = PyUnicode_KIND(self);
11325 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011328 if (length == 1)
11329 return PyBool_FromLong(
11330 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011332 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011334 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011335
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011336 for (i = 0; i < length; i++) {
11337 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011338 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011339 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011340 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011341 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342}
11343
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011344PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011345 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011346\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011347Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011348and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011349
11350static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011351unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011352{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 Py_ssize_t i, length;
11354 int kind;
11355 void *data;
11356
11357 if (PyUnicode_READY(self) == -1)
11358 return NULL;
11359 length = PyUnicode_GET_LENGTH(self);
11360 kind = PyUnicode_KIND(self);
11361 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011362
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011363 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011364 if (length == 1)
11365 return PyBool_FromLong(
11366 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011367
11368 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011370 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011372 for (i = 0; i < length; i++) {
11373 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011374 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011375 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011376 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011377}
11378
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011379PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011380 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011381\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011382Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011383and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011384
11385static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011386unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011387{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011388 int kind;
11389 void *data;
11390 Py_ssize_t len, i;
11391
11392 if (PyUnicode_READY(self) == -1)
11393 return NULL;
11394
11395 kind = PyUnicode_KIND(self);
11396 data = PyUnicode_DATA(self);
11397 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011398
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011399 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 if (len == 1) {
11401 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11402 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11403 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011404
11405 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011406 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011407 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011408
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409 for (i = 0; i < len; i++) {
11410 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011411 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011412 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011413 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011414 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011415}
11416
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011417PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011418 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011420Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011421False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422
11423static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011424unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 Py_ssize_t i, length;
11427 int kind;
11428 void *data;
11429
11430 if (PyUnicode_READY(self) == -1)
11431 return NULL;
11432 length = PyUnicode_GET_LENGTH(self);
11433 kind = PyUnicode_KIND(self);
11434 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011437 if (length == 1)
11438 return PyBool_FromLong(
11439 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011441 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011442 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011443 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 for (i = 0; i < length; i++) {
11446 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011447 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011449 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450}
11451
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011452PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011453 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011455Return True if all characters in S are digits\n\
11456and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457
11458static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011459unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011461 Py_ssize_t i, length;
11462 int kind;
11463 void *data;
11464
11465 if (PyUnicode_READY(self) == -1)
11466 return NULL;
11467 length = PyUnicode_GET_LENGTH(self);
11468 kind = PyUnicode_KIND(self);
11469 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011472 if (length == 1) {
11473 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11474 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11475 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011477 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011478 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011479 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011480
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481 for (i = 0; i < length; i++) {
11482 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011483 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011485 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486}
11487
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011488PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011489 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011491Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011492False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493
11494static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011495unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011497 Py_ssize_t i, length;
11498 int kind;
11499 void *data;
11500
11501 if (PyUnicode_READY(self) == -1)
11502 return NULL;
11503 length = PyUnicode_GET_LENGTH(self);
11504 kind = PyUnicode_KIND(self);
11505 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011508 if (length == 1)
11509 return PyBool_FromLong(
11510 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011512 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011514 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011516 for (i = 0; i < length; i++) {
11517 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011518 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011520 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521}
11522
Martin v. Löwis47383402007-08-15 07:32:56 +000011523int
11524PyUnicode_IsIdentifier(PyObject *self)
11525{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011526 int kind;
11527 void *data;
11528 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011529 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011531 if (PyUnicode_READY(self) == -1) {
11532 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011533 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011534 }
11535
11536 /* Special case for empty strings */
11537 if (PyUnicode_GET_LENGTH(self) == 0)
11538 return 0;
11539 kind = PyUnicode_KIND(self);
11540 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011541
11542 /* PEP 3131 says that the first character must be in
11543 XID_Start and subsequent characters in XID_Continue,
11544 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011545 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011546 letters, digits, underscore). However, given the current
11547 definition of XID_Start and XID_Continue, it is sufficient
11548 to check just for these, except that _ must be allowed
11549 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011550 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011551 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011552 return 0;
11553
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011554 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011555 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011556 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011557 return 1;
11558}
11559
11560PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011561 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011562\n\
11563Return True if S is a valid identifier according\n\
11564to the language definition.");
11565
11566static PyObject*
11567unicode_isidentifier(PyObject *self)
11568{
11569 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11570}
11571
Georg Brandl559e5d72008-06-11 18:37:52 +000011572PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011573 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011574\n\
11575Return True if all characters in S are considered\n\
11576printable in repr() or S is empty, False otherwise.");
11577
11578static PyObject*
11579unicode_isprintable(PyObject *self)
11580{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011581 Py_ssize_t i, length;
11582 int kind;
11583 void *data;
11584
11585 if (PyUnicode_READY(self) == -1)
11586 return NULL;
11587 length = PyUnicode_GET_LENGTH(self);
11588 kind = PyUnicode_KIND(self);
11589 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011590
11591 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592 if (length == 1)
11593 return PyBool_FromLong(
11594 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011596 for (i = 0; i < length; i++) {
11597 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011598 Py_RETURN_FALSE;
11599 }
11600 }
11601 Py_RETURN_TRUE;
11602}
11603
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011604PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011605 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606\n\
11607Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011608iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609
11610static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011611unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011613 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614}
11615
Martin v. Löwis18e16552006-02-15 17:27:45 +000011616static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011617unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011619 if (PyUnicode_READY(self) == -1)
11620 return -1;
11621 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622}
11623
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011624PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011625 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011627Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011628done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629
11630static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011631unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011633 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011634 Py_UCS4 fillchar = ' ';
11635
11636 if (PyUnicode_READY(self) == -1)
11637 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011638
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011639 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640 return NULL;
11641
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011642 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011644 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011645 }
11646
Victor Stinner7931d9a2011-11-04 00:22:48 +010011647 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648}
11649
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011650PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011651 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011653Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654
11655static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011656unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658 return fixup(self, fixlower);
11659}
11660
/* Selectors naming which end(s) of a string a strip operation trims. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: its format string minus the "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
11669
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011670/* externally visible for str.strip(unicode) */
11671PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011672_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011673{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011674 void *data;
11675 int kind;
11676 Py_ssize_t i, j, len;
11677 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011678
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11680 return NULL;
11681
11682 kind = PyUnicode_KIND(self);
11683 data = PyUnicode_DATA(self);
11684 len = PyUnicode_GET_LENGTH(self);
11685 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11686 PyUnicode_DATA(sepobj),
11687 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011688
Benjamin Peterson14339b62009-01-31 16:36:08 +000011689 i = 0;
11690 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011691 while (i < len &&
11692 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011693 i++;
11694 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011695 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011696
Benjamin Peterson14339b62009-01-31 16:36:08 +000011697 j = len;
11698 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011699 do {
11700 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701 } while (j >= i &&
11702 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011703 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011704 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011705
Victor Stinner7931d9a2011-11-04 00:22:48 +010011706 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011707}
11708
11709PyObject*
11710PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11711{
11712 unsigned char *data;
11713 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011714 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011715
Victor Stinnerde636f32011-10-01 03:55:54 +020011716 if (PyUnicode_READY(self) == -1)
11717 return NULL;
11718
11719 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11720
Victor Stinner12bab6d2011-10-01 01:53:49 +020011721 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011722 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011723 if (PyUnicode_CheckExact(self)) {
11724 Py_INCREF(self);
11725 return self;
11726 }
11727 else
11728 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011729 }
11730
Victor Stinner12bab6d2011-10-01 01:53:49 +020011731 length = end - start;
11732 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011733 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734
Victor Stinnerde636f32011-10-01 03:55:54 +020011735 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011736 PyErr_SetString(PyExc_IndexError, "string index out of range");
11737 return NULL;
11738 }
11739
Victor Stinnerb9275c12011-10-05 14:01:42 +020011740 if (PyUnicode_IS_ASCII(self)) {
11741 kind = PyUnicode_KIND(self);
11742 data = PyUnicode_1BYTE_DATA(self);
11743 return unicode_fromascii(data + start, length);
11744 }
11745 else {
11746 kind = PyUnicode_KIND(self);
11747 data = PyUnicode_1BYTE_DATA(self);
11748 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011749 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011750 length);
11751 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753
11754static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011755do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757 int kind;
11758 void *data;
11759 Py_ssize_t len, i, j;
11760
11761 if (PyUnicode_READY(self) == -1)
11762 return NULL;
11763
11764 kind = PyUnicode_KIND(self);
11765 data = PyUnicode_DATA(self);
11766 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011767
Benjamin Peterson14339b62009-01-31 16:36:08 +000011768 i = 0;
11769 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011771 i++;
11772 }
11773 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011774
Benjamin Peterson14339b62009-01-31 16:36:08 +000011775 j = len;
11776 if (striptype != LEFTSTRIP) {
11777 do {
11778 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011780 j++;
11781 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011782
Victor Stinner7931d9a2011-11-04 00:22:48 +010011783 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784}
11785
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011786
11787static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011788do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011789{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011790 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011791
Benjamin Peterson14339b62009-01-31 16:36:08 +000011792 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11793 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011794
Benjamin Peterson14339b62009-01-31 16:36:08 +000011795 if (sep != NULL && sep != Py_None) {
11796 if (PyUnicode_Check(sep))
11797 return _PyUnicode_XStrip(self, striptype, sep);
11798 else {
11799 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011800 "%s arg must be None or str",
11801 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011802 return NULL;
11803 }
11804 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011805
Benjamin Peterson14339b62009-01-31 16:36:08 +000011806 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011807}
11808
11809
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011810PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011811 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011812\n\
11813Return a copy of the string S with leading and trailing\n\
11814whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011815If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011816
11817static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011818unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011819{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011820 if (PyTuple_GET_SIZE(args) == 0)
11821 return do_strip(self, BOTHSTRIP); /* Common case */
11822 else
11823 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011824}
11825
11826
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011827PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011828 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011829\n\
11830Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011831If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011832
11833static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011834unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011835{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011836 if (PyTuple_GET_SIZE(args) == 0)
11837 return do_strip(self, LEFTSTRIP); /* Common case */
11838 else
11839 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011840}
11841
11842
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011843PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011844 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011845\n\
11846Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011847If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011848
11849static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011850unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011851{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011852 if (PyTuple_GET_SIZE(args) == 0)
11853 return do_strip(self, RIGHTSTRIP); /* Common case */
11854 else
11855 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011856}
11857
11858
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011860unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011862 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011863 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864
Georg Brandl222de0f2009-04-12 12:01:50 +000011865 if (len < 1) {
11866 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011867 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011868 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869
Tim Peters7a29bd52001-09-12 03:03:31 +000011870 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871 /* no repeat, return original string */
11872 Py_INCREF(str);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011873 return str;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874 }
Tim Peters8f422462000-09-09 06:13:41 +000011875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 if (PyUnicode_READY(str) == -1)
11877 return NULL;
11878
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011879 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011880 PyErr_SetString(PyExc_OverflowError,
11881 "repeated string is too long");
11882 return NULL;
11883 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011884 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011885
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011886 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887 if (!u)
11888 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011889 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011891 if (PyUnicode_GET_LENGTH(str) == 1) {
11892 const int kind = PyUnicode_KIND(str);
11893 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11894 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011895 if (kind == PyUnicode_1BYTE_KIND)
11896 memset(to, (unsigned char)fill_char, len);
11897 else {
11898 for (n = 0; n < len; ++n)
11899 PyUnicode_WRITE(kind, to, n, fill_char);
11900 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 }
11902 else {
11903 /* number of characters copied this far */
11904 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011905 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906 char *to = (char *) PyUnicode_DATA(u);
11907 Py_MEMCPY(to, PyUnicode_DATA(str),
11908 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011909 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011910 n = (done <= nchars-done) ? done : nchars-done;
11911 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011912 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011913 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914 }
11915
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011916 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011917 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918}
11919
Alexander Belopolsky40018472011-02-26 01:02:56 +000011920PyObject *
11921PyUnicode_Replace(PyObject *obj,
11922 PyObject *subobj,
11923 PyObject *replobj,
11924 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925{
11926 PyObject *self;
11927 PyObject *str1;
11928 PyObject *str2;
11929 PyObject *result;
11930
11931 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011932 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011933 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011935 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011936 Py_DECREF(self);
11937 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938 }
11939 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011940 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011941 Py_DECREF(self);
11942 Py_DECREF(str1);
11943 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011944 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011945 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946 Py_DECREF(self);
11947 Py_DECREF(str1);
11948 Py_DECREF(str2);
11949 return result;
11950}
11951
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011952PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011953 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954\n\
11955Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011956old replaced by new. If the optional argument count is\n\
11957given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958
11959static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011960unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011962 PyObject *str1;
11963 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011964 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965 PyObject *result;
11966
Martin v. Löwis18e16552006-02-15 17:27:45 +000011967 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011970 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 str1 = PyUnicode_FromObject(str1);
11972 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11973 return NULL;
11974 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011975 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011976 Py_DECREF(str1);
11977 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979
11980 result = replace(self, str1, str2, maxcount);
11981
11982 Py_DECREF(str1);
11983 Py_DECREF(str2);
11984 return result;
11985}
11986
Alexander Belopolsky40018472011-02-26 01:02:56 +000011987static PyObject *
11988unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011990 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 Py_ssize_t isize;
11992 Py_ssize_t osize, squote, dquote, i, o;
11993 Py_UCS4 max, quote;
11994 int ikind, okind;
11995 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011998 return NULL;
11999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 isize = PyUnicode_GET_LENGTH(unicode);
12001 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 /* Compute length of output, quote characters, and
12004 maximum character */
12005 osize = 2; /* quotes */
12006 max = 127;
12007 squote = dquote = 0;
12008 ikind = PyUnicode_KIND(unicode);
12009 for (i = 0; i < isize; i++) {
12010 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12011 switch (ch) {
12012 case '\'': squote++; osize++; break;
12013 case '"': dquote++; osize++; break;
12014 case '\\': case '\t': case '\r': case '\n':
12015 osize += 2; break;
12016 default:
12017 /* Fast-path ASCII */
12018 if (ch < ' ' || ch == 0x7f)
12019 osize += 4; /* \xHH */
12020 else if (ch < 0x7f)
12021 osize++;
12022 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12023 osize++;
12024 max = ch > max ? ch : max;
12025 }
12026 else if (ch < 0x100)
12027 osize += 4; /* \xHH */
12028 else if (ch < 0x10000)
12029 osize += 6; /* \uHHHH */
12030 else
12031 osize += 10; /* \uHHHHHHHH */
12032 }
12033 }
12034
12035 quote = '\'';
12036 if (squote) {
12037 if (dquote)
12038 /* Both squote and dquote present. Use squote,
12039 and escape them */
12040 osize += squote;
12041 else
12042 quote = '"';
12043 }
12044
12045 repr = PyUnicode_New(osize, max);
12046 if (repr == NULL)
12047 return NULL;
12048 okind = PyUnicode_KIND(repr);
12049 odata = PyUnicode_DATA(repr);
12050
12051 PyUnicode_WRITE(okind, odata, 0, quote);
12052 PyUnicode_WRITE(okind, odata, osize-1, quote);
12053
12054 for (i = 0, o = 1; i < isize; i++) {
12055 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012056
12057 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058 if ((ch == quote) || (ch == '\\')) {
12059 PyUnicode_WRITE(okind, odata, o++, '\\');
12060 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012061 continue;
12062 }
12063
Benjamin Peterson29060642009-01-31 22:14:21 +000012064 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012065 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066 PyUnicode_WRITE(okind, odata, o++, '\\');
12067 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012068 }
12069 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012070 PyUnicode_WRITE(okind, odata, o++, '\\');
12071 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012072 }
12073 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012074 PyUnicode_WRITE(okind, odata, o++, '\\');
12075 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012076 }
12077
12078 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012079 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012080 PyUnicode_WRITE(okind, odata, o++, '\\');
12081 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012082 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12083 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012084 }
12085
Georg Brandl559e5d72008-06-11 18:37:52 +000012086 /* Copy ASCII characters as-is */
12087 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012088 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012089 }
12090
Benjamin Peterson29060642009-01-31 22:14:21 +000012091 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012092 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012093 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012094 (categories Z* and C* except ASCII space)
12095 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012096 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012097 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012098 if (ch <= 0xff) {
12099 PyUnicode_WRITE(okind, odata, o++, '\\');
12100 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012101 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12102 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012103 }
12104 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 else if (ch >= 0x10000) {
12106 PyUnicode_WRITE(okind, odata, o++, '\\');
12107 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012108 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12109 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12110 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12111 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12112 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12113 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12114 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12115 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012116 }
12117 /* Map 16-bit characters to '\uxxxx' */
12118 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012119 PyUnicode_WRITE(okind, odata, o++, '\\');
12120 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012121 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12122 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12123 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12124 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012125 }
12126 }
12127 /* Copy characters as-is */
12128 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012130 }
12131 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012132 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012133 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012134 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012135 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012136}
12137
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012138PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012139 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012140\n\
12141Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012142such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143arguments start and end are interpreted as in slice notation.\n\
12144\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012145Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146
12147static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012150 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012151 Py_ssize_t start;
12152 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012153 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154
Jesus Ceaac451502011-04-20 17:09:23 +020012155 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12156 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012157 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159 if (PyUnicode_READY(self) == -1)
12160 return NULL;
12161 if (PyUnicode_READY(substring) == -1)
12162 return NULL;
12163
Victor Stinner7931d9a2011-11-04 00:22:48 +010012164 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165
12166 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012168 if (result == -2)
12169 return NULL;
12170
Christian Heimes217cfd12007-12-02 14:31:20 +000012171 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172}
12173
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012174PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012175 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012177Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178
12179static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012180unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012182 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012183 Py_ssize_t start;
12184 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012185 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186
Jesus Ceaac451502011-04-20 17:09:23 +020012187 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12188 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012189 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191 if (PyUnicode_READY(self) == -1)
12192 return NULL;
12193 if (PyUnicode_READY(substring) == -1)
12194 return NULL;
12195
Victor Stinner7931d9a2011-11-04 00:22:48 +010012196 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197
12198 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012199
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012200 if (result == -2)
12201 return NULL;
12202
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203 if (result < 0) {
12204 PyErr_SetString(PyExc_ValueError, "substring not found");
12205 return NULL;
12206 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207
Christian Heimes217cfd12007-12-02 14:31:20 +000012208 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209}
12210
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012211PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012212 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012213\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012214Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012215done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012216
12217static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012218unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012220 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012221 Py_UCS4 fillchar = ' ';
12222
Victor Stinnere9a29352011-10-01 02:14:59 +020012223 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012224 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012225
Victor Stinnere9a29352011-10-01 02:14:59 +020012226 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227 return NULL;
12228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012231 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232 }
12233
Victor Stinner7931d9a2011-11-04 00:22:48 +010012234 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235}
12236
Alexander Belopolsky40018472011-02-26 01:02:56 +000012237PyObject *
12238PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012239{
12240 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012241
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242 s = PyUnicode_FromObject(s);
12243 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012244 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012245 if (sep != NULL) {
12246 sep = PyUnicode_FromObject(sep);
12247 if (sep == NULL) {
12248 Py_DECREF(s);
12249 return NULL;
12250 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251 }
12252
Victor Stinner9310abb2011-10-05 00:59:23 +020012253 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012254
12255 Py_DECREF(s);
12256 Py_XDECREF(sep);
12257 return result;
12258}
12259
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012260PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012261 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262\n\
12263Return a list of the words in S, using sep as the\n\
12264delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012265splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012266whitespace string is a separator and empty strings are\n\
12267removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268
12269static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012270unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271{
12272 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012273 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274
Martin v. Löwis18e16552006-02-15 17:27:45 +000012275 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276 return NULL;
12277
12278 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012279 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012281 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012283 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284}
12285
Thomas Wouters477c8d52006-05-27 19:21:47 +000012286PyObject *
12287PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12288{
12289 PyObject* str_obj;
12290 PyObject* sep_obj;
12291 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012292 int kind1, kind2, kind;
12293 void *buf1 = NULL, *buf2 = NULL;
12294 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012295
12296 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012297 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012298 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012299 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012301 Py_DECREF(str_obj);
12302 return NULL;
12303 }
12304
Victor Stinner14f8f022011-10-05 20:58:25 +020012305 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012306 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012307 kind = Py_MAX(kind1, kind2);
12308 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012309 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012310 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 if (!buf1)
12312 goto onError;
12313 buf2 = PyUnicode_DATA(sep_obj);
12314 if (kind2 != kind)
12315 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12316 if (!buf2)
12317 goto onError;
12318 len1 = PyUnicode_GET_LENGTH(str_obj);
12319 len2 = PyUnicode_GET_LENGTH(sep_obj);
12320
Victor Stinner14f8f022011-10-05 20:58:25 +020012321 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012323 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12324 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12325 else
12326 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012327 break;
12328 case PyUnicode_2BYTE_KIND:
12329 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12330 break;
12331 case PyUnicode_4BYTE_KIND:
12332 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12333 break;
12334 default:
12335 assert(0);
12336 out = 0;
12337 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012338
12339 Py_DECREF(sep_obj);
12340 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012341 if (kind1 != kind)
12342 PyMem_Free(buf1);
12343 if (kind2 != kind)
12344 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012345
12346 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012347 onError:
12348 Py_DECREF(sep_obj);
12349 Py_DECREF(str_obj);
12350 if (kind1 != kind && buf1)
12351 PyMem_Free(buf1);
12352 if (kind2 != kind && buf2)
12353 PyMem_Free(buf2);
12354 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012355}
12356
12357
12358PyObject *
12359PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12360{
12361 PyObject* str_obj;
12362 PyObject* sep_obj;
12363 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012364 int kind1, kind2, kind;
12365 void *buf1 = NULL, *buf2 = NULL;
12366 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012367
12368 str_obj = PyUnicode_FromObject(str_in);
12369 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012370 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012371 sep_obj = PyUnicode_FromObject(sep_in);
12372 if (!sep_obj) {
12373 Py_DECREF(str_obj);
12374 return NULL;
12375 }
12376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012377 kind1 = PyUnicode_KIND(str_in);
12378 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012379 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012380 buf1 = PyUnicode_DATA(str_in);
12381 if (kind1 != kind)
12382 buf1 = _PyUnicode_AsKind(str_in, kind);
12383 if (!buf1)
12384 goto onError;
12385 buf2 = PyUnicode_DATA(sep_obj);
12386 if (kind2 != kind)
12387 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12388 if (!buf2)
12389 goto onError;
12390 len1 = PyUnicode_GET_LENGTH(str_obj);
12391 len2 = PyUnicode_GET_LENGTH(sep_obj);
12392
12393 switch(PyUnicode_KIND(str_in)) {
12394 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012395 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12396 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12397 else
12398 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012399 break;
12400 case PyUnicode_2BYTE_KIND:
12401 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12402 break;
12403 case PyUnicode_4BYTE_KIND:
12404 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12405 break;
12406 default:
12407 assert(0);
12408 out = 0;
12409 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012410
12411 Py_DECREF(sep_obj);
12412 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012413 if (kind1 != kind)
12414 PyMem_Free(buf1);
12415 if (kind2 != kind)
12416 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012417
12418 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012419 onError:
12420 Py_DECREF(sep_obj);
12421 Py_DECREF(str_obj);
12422 if (kind1 != kind && buf1)
12423 PyMem_Free(buf1);
12424 if (kind2 != kind && buf2)
12425 PyMem_Free(buf2);
12426 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012427}
12428
12429PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012430 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012431\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012432Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012433the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012434found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012435
12436static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012437unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012438{
Victor Stinner9310abb2011-10-05 00:59:23 +020012439 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012440}
12441
12442PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012443 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012444\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012445Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012446the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012447separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012448
12449static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012450unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012451{
Victor Stinner9310abb2011-10-05 00:59:23 +020012452 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012453}
12454
Alexander Belopolsky40018472011-02-26 01:02:56 +000012455PyObject *
12456PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012457{
12458 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012459
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012460 s = PyUnicode_FromObject(s);
12461 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012462 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012463 if (sep != NULL) {
12464 sep = PyUnicode_FromObject(sep);
12465 if (sep == NULL) {
12466 Py_DECREF(s);
12467 return NULL;
12468 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012469 }
12470
Victor Stinner9310abb2011-10-05 00:59:23 +020012471 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012472
12473 Py_DECREF(s);
12474 Py_XDECREF(sep);
12475 return result;
12476}
12477
12478PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012479 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012480\n\
12481Return a list of the words in S, using sep as the\n\
12482delimiter string, starting at the end of the string and\n\
12483working to the front. If maxsplit is given, at most maxsplit\n\
12484splits are done. If sep is not specified, any whitespace string\n\
12485is a separator.");
12486
12487static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012488unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012489{
12490 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012491 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012492
Martin v. Löwis18e16552006-02-15 17:27:45 +000012493 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012494 return NULL;
12495
12496 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012497 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012498 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012499 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012500 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012501 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012502}
12503
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012504PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012505 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012506\n\
12507Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012508Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012509is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012510
12511static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012512unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012513{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012514 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012515 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012517 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12518 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519 return NULL;
12520
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012521 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522}
12523
12524static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012525PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526{
Walter Dörwald346737f2007-05-31 10:44:43 +000012527 if (PyUnicode_CheckExact(self)) {
12528 Py_INCREF(self);
12529 return self;
12530 } else
12531 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012532 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533}
12534
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012535PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012536 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537\n\
12538Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012539and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540
12541static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012542unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544 return fixup(self, fixswapcase);
12545}
12546
Georg Brandlceee0772007-11-27 23:48:05 +000012547PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012548 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012549\n\
12550Return a translation table usable for str.translate().\n\
12551If there is only one argument, it must be a dictionary mapping Unicode\n\
12552ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012553Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012554If there are two arguments, they must be strings of equal length, and\n\
12555in the resulting dictionary, each character in x will be mapped to the\n\
12556character at the same position in y. If there is a third argument, it\n\
12557must be a string, whose characters will be mapped to None in the result.");
12558
12559static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012560unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012561{
12562 PyObject *x, *y = NULL, *z = NULL;
12563 PyObject *new = NULL, *key, *value;
12564 Py_ssize_t i = 0;
12565 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012566
Georg Brandlceee0772007-11-27 23:48:05 +000012567 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12568 return NULL;
12569 new = PyDict_New();
12570 if (!new)
12571 return NULL;
12572 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012573 int x_kind, y_kind, z_kind;
12574 void *x_data, *y_data, *z_data;
12575
Georg Brandlceee0772007-11-27 23:48:05 +000012576 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012577 if (!PyUnicode_Check(x)) {
12578 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12579 "be a string if there is a second argument");
12580 goto err;
12581 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012582 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012583 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12584 "arguments must have equal length");
12585 goto err;
12586 }
12587 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012588 x_kind = PyUnicode_KIND(x);
12589 y_kind = PyUnicode_KIND(y);
12590 x_data = PyUnicode_DATA(x);
12591 y_data = PyUnicode_DATA(y);
12592 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12593 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12594 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012595 if (!key || !value)
12596 goto err;
12597 res = PyDict_SetItem(new, key, value);
12598 Py_DECREF(key);
12599 Py_DECREF(value);
12600 if (res < 0)
12601 goto err;
12602 }
12603 /* create entries for deleting chars in z */
12604 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012605 z_kind = PyUnicode_KIND(z);
12606 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012607 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012609 if (!key)
12610 goto err;
12611 res = PyDict_SetItem(new, key, Py_None);
12612 Py_DECREF(key);
12613 if (res < 0)
12614 goto err;
12615 }
12616 }
12617 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012618 int kind;
12619 void *data;
12620
Georg Brandlceee0772007-11-27 23:48:05 +000012621 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012622 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012623 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12624 "to maketrans it must be a dict");
12625 goto err;
12626 }
12627 /* copy entries into the new dict, converting string keys to int keys */
12628 while (PyDict_Next(x, &i, &key, &value)) {
12629 if (PyUnicode_Check(key)) {
12630 /* convert string keys to integer keys */
12631 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012632 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012633 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12634 "table must be of length 1");
12635 goto err;
12636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 kind = PyUnicode_KIND(key);
12638 data = PyUnicode_DATA(key);
12639 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012640 if (!newkey)
12641 goto err;
12642 res = PyDict_SetItem(new, newkey, value);
12643 Py_DECREF(newkey);
12644 if (res < 0)
12645 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012646 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012647 /* just keep integer keys */
12648 if (PyDict_SetItem(new, key, value) < 0)
12649 goto err;
12650 } else {
12651 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12652 "be strings or integers");
12653 goto err;
12654 }
12655 }
12656 }
12657 return new;
12658 err:
12659 Py_DECREF(new);
12660 return NULL;
12661}
12662
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012663PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012664 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012665\n\
12666Return a copy of the string S, where all characters have been mapped\n\
12667through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012668Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012669Unmapped characters are left untouched. Characters mapped to None\n\
12670are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012671
12672static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012674{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012675 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012676}
12677
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012678PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012679 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012680\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012681Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682
12683static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012684unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012685{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012686 return fixup(self, fixupper);
12687}
12688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012689PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012690 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012692Pad a numeric string S with zeros on the left, to fill a field\n\
12693of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694
12695static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012696unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012698 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012699 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012700 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012701 int kind;
12702 void *data;
12703 Py_UCS4 chr;
12704
12705 if (PyUnicode_READY(self) == -1)
12706 return NULL;
12707
Martin v. Löwis18e16552006-02-15 17:27:45 +000012708 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012709 return NULL;
12710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012712 if (PyUnicode_CheckExact(self)) {
12713 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012714 return self;
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012715 }
12716 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012717 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012718 }
12719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012720 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012721
12722 u = pad(self, fill, 0, '0');
12723
Walter Dörwald068325e2002-04-15 13:36:47 +000012724 if (u == NULL)
12725 return NULL;
12726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012727 kind = PyUnicode_KIND(u);
12728 data = PyUnicode_DATA(u);
12729 chr = PyUnicode_READ(kind, data, fill);
12730
12731 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012732 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012733 PyUnicode_WRITE(kind, data, 0, chr);
12734 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012735 }
12736
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012737 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012738 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012739}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740
#if 0
/* Compiled out.  Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() applied to `self`. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
12748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012749PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012750 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012751\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012752Return True if S starts with the specified prefix, False otherwise.\n\
12753With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012754With optional end, stop comparing S at that position.\n\
12755prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012756
12757static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012758unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012759 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012760{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012761 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012762 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012763 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012764 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012765 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012766
Jesus Ceaac451502011-04-20 17:09:23 +020012767 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012768 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012769 if (PyTuple_Check(subobj)) {
12770 Py_ssize_t i;
12771 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012772 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012773 if (substring == NULL)
12774 return NULL;
12775 result = tailmatch(self, substring, start, end, -1);
12776 Py_DECREF(substring);
12777 if (result) {
12778 Py_RETURN_TRUE;
12779 }
12780 }
12781 /* nothing matched */
12782 Py_RETURN_FALSE;
12783 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012784 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012785 if (substring == NULL) {
12786 if (PyErr_ExceptionMatches(PyExc_TypeError))
12787 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12788 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012789 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012790 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012791 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012792 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012793 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012794}
12795
12796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012797PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012798 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012800Return True if S ends with the specified suffix, False otherwise.\n\
12801With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012802With optional end, stop comparing S at that position.\n\
12803suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012804
12805static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012806unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012807 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012808{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012809 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012810 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012811 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012812 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012813 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814
Jesus Ceaac451502011-04-20 17:09:23 +020012815 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012816 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012817 if (PyTuple_Check(subobj)) {
12818 Py_ssize_t i;
12819 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012820 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012821 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012822 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012823 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012824 result = tailmatch(self, substring, start, end, +1);
12825 Py_DECREF(substring);
12826 if (result) {
12827 Py_RETURN_TRUE;
12828 }
12829 }
12830 Py_RETURN_FALSE;
12831 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012832 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012833 if (substring == NULL) {
12834 if (PyErr_ExceptionMatches(PyExc_TypeError))
12835 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12836 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012837 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012838 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012839 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012840 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012841 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012842}
12843
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012844#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012845
12846PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012847 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012848\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012849Return a formatted version of S, using substitutions from args and kwargs.\n\
12850The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012851
Eric Smith27bbca62010-11-04 17:06:58 +000012852PyDoc_STRVAR(format_map__doc__,
12853 "S.format_map(mapping) -> str\n\
12854\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012855Return a formatted version of S, using substitutions from mapping.\n\
12856The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012857
Eric Smith4a7d76d2008-05-30 18:10:19 +000012858static PyObject *
12859unicode__format__(PyObject* self, PyObject* args)
12860{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012861 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012862
12863 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12864 return NULL;
12865
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012866 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012867 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012868 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012869}
12870
Eric Smith8c663262007-08-25 02:26:07 +000012871PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012872 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012873\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012874Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012875
12876static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012877unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012878{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012879 Py_ssize_t size;
12880
12881 /* If it's a compact object, account for base structure +
12882 character data. */
12883 if (PyUnicode_IS_COMPACT_ASCII(v))
12884 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12885 else if (PyUnicode_IS_COMPACT(v))
12886 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012887 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012888 else {
12889 /* If it is a two-block object, account for base object, and
12890 for character block if present. */
12891 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012892 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012893 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012894 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012895 }
12896 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012897 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012898 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012899 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012900 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012901 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012902
12903 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012904}
12905
12906PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012907 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012908
12909static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012910unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012911{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012912 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012913 if (!copy)
12914 return NULL;
12915 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012916}
12917
Guido van Rossumd57fd912000-03-10 22:53:23 +000012918static PyMethodDef unicode_methods[] = {
12919
12920 /* Order is according to common usage: often used methods should
12921 appear first, since lookup is done sequentially. */
12922
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012923 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012924 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12925 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012926 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012927 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12928 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12929 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12930 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12931 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12932 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12933 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012934 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012935 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12936 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12937 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012938 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012939 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12940 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12941 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012942 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012943 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012944 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012945 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012946 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12947 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12948 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12949 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12950 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12951 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12952 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12953 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12954 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12955 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12956 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12957 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12958 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12959 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012960 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012961 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012962 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012963 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012964 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012965 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012966 {"maketrans", (PyCFunction) unicode_maketrans,
12967 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012968 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012969#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012970 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012971#endif
12972
12973#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012974 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012975 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012976#endif
12977
Benjamin Peterson14339b62009-01-31 16:36:08 +000012978 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012979 {NULL, NULL}
12980};
12981
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012982static PyObject *
12983unicode_mod(PyObject *v, PyObject *w)
12984{
Brian Curtindfc80e32011-08-10 20:28:54 -050012985 if (!PyUnicode_Check(v))
12986 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012987 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012988}
12989
12990static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012991 0, /*nb_add*/
12992 0, /*nb_subtract*/
12993 0, /*nb_multiply*/
12994 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012995};
12996
Guido van Rossumd57fd912000-03-10 22:53:23 +000012997static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012998 (lenfunc) unicode_length, /* sq_length */
12999 PyUnicode_Concat, /* sq_concat */
13000 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13001 (ssizeargfunc) unicode_getitem, /* sq_item */
13002 0, /* sq_slice */
13003 0, /* sq_ass_item */
13004 0, /* sq_ass_slice */
13005 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013006};
13007
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013008static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013009unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013010{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011 if (PyUnicode_READY(self) == -1)
13012 return NULL;
13013
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013014 if (PyIndex_Check(item)) {
13015 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013016 if (i == -1 && PyErr_Occurred())
13017 return NULL;
13018 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013019 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013020 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013021 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013022 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013023 PyObject *result;
13024 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013025 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013026 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013028 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013029 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013030 return NULL;
13031 }
13032
13033 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013034 return PyUnicode_New(0, 0);
13035 } else if (start == 0 && step == 1 &&
13036 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000013037 PyUnicode_CheckExact(self)) {
13038 Py_INCREF(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013039 return self;
Thomas Woutersed03b412007-08-28 21:37:11 +000013040 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013041 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013042 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013043 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013044 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013045 src_kind = PyUnicode_KIND(self);
13046 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013047 if (!PyUnicode_IS_ASCII(self)) {
13048 kind_limit = kind_maxchar_limit(src_kind);
13049 max_char = 0;
13050 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13051 ch = PyUnicode_READ(src_kind, src_data, cur);
13052 if (ch > max_char) {
13053 max_char = ch;
13054 if (max_char >= kind_limit)
13055 break;
13056 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013057 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013058 }
Victor Stinner55c99112011-10-13 01:17:06 +020013059 else
13060 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013061 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013062 if (result == NULL)
13063 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013064 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013065 dest_data = PyUnicode_DATA(result);
13066
13067 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013068 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13069 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013070 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013071 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013072 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013073 } else {
13074 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13075 return NULL;
13076 }
13077}
13078
13079static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013080 (lenfunc)unicode_length, /* mp_length */
13081 (binaryfunc)unicode_subscript, /* mp_subscript */
13082 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013083};
13084
Guido van Rossumd57fd912000-03-10 22:53:23 +000013085
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086/* Helpers for PyUnicode_Format() */
13087
13088static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013089getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013090{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013091 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013093 (*p_argidx)++;
13094 if (arglen < 0)
13095 return args;
13096 else
13097 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098 }
13099 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013100 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013101 return NULL;
13102}
13103
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013104/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013106static PyObject *
13107formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013108{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013109 char *p;
13110 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013112
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113 x = PyFloat_AsDouble(v);
13114 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013115 return NULL;
13116
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013118 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013119
Eric Smith0923d1d2009-04-16 20:16:10 +000013120 p = PyOS_double_to_string(x, type, prec,
13121 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013122 if (p == NULL)
13123 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013124 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013125 PyMem_Free(p);
13126 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127}
13128
Tim Peters38fd5b62000-09-21 05:43:11 +000013129static PyObject*
13130formatlong(PyObject *val, int flags, int prec, int type)
13131{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013132 char *buf;
13133 int len;
13134 PyObject *str; /* temporary string object. */
13135 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013136
Benjamin Peterson14339b62009-01-31 16:36:08 +000013137 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13138 if (!str)
13139 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013140 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013141 Py_DECREF(str);
13142 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013143}
13144
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013145static Py_UCS4
13146formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013147{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013148 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013149 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013150 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013151 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013152 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013153 goto onError;
13154 }
13155 else {
13156 /* Integer input truncated to a character */
13157 long x;
13158 x = PyLong_AsLong(v);
13159 if (x == -1 && PyErr_Occurred())
13160 goto onError;
13161
13162 if (x < 0 || x > 0x10ffff) {
13163 PyErr_SetString(PyExc_OverflowError,
13164 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013165 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013166 }
13167
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013168 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013169 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013170
Benjamin Peterson29060642009-01-31 22:14:21 +000013171 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013172 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013173 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013174 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013175}
13176
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013177static int
13178repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13179{
13180 int r;
13181 assert(count > 0);
13182 assert(PyUnicode_Check(obj));
13183 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013184 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013185 if (repeated == NULL)
13186 return -1;
13187 r = _PyAccu_Accumulate(acc, repeated);
13188 Py_DECREF(repeated);
13189 return r;
13190 }
13191 else {
13192 do {
13193 if (_PyAccu_Accumulate(acc, obj))
13194 return -1;
13195 } while (--count);
13196 return 0;
13197 }
13198}
13199
Alexander Belopolsky40018472011-02-26 01:02:56 +000013200PyObject *
13201PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013202{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013203 void *fmt;
13204 int fmtkind;
13205 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013206 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013207 int r;
13208 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013209 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013210 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013211 PyObject *temp = NULL;
13212 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013213 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013214 _PyAccu acc;
13215 static PyObject *plus, *minus, *blank, *zero, *percent;
13216
13217 if (!plus && !(plus = get_latin1_char('+')))
13218 return NULL;
13219 if (!minus && !(minus = get_latin1_char('-')))
13220 return NULL;
13221 if (!blank && !(blank = get_latin1_char(' ')))
13222 return NULL;
13223 if (!zero && !(zero = get_latin1_char('0')))
13224 return NULL;
13225 if (!percent && !(percent = get_latin1_char('%')))
13226 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013227
Guido van Rossumd57fd912000-03-10 22:53:23 +000013228 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013229 PyErr_BadInternalCall();
13230 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013231 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013232 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013233 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013234 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013235 if (_PyAccu_Init(&acc))
13236 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013237 fmt = PyUnicode_DATA(uformat);
13238 fmtkind = PyUnicode_KIND(uformat);
13239 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13240 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013241
Guido van Rossumd57fd912000-03-10 22:53:23 +000013242 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013243 arglen = PyTuple_Size(args);
13244 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013245 }
13246 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013247 arglen = -1;
13248 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013249 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013250 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013251 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013252 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013253
13254 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013255 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013256 PyObject *nonfmt;
13257 Py_ssize_t nonfmtpos;
13258 nonfmtpos = fmtpos++;
13259 while (fmtcnt >= 0 &&
13260 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13261 fmtpos++;
13262 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013263 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013264 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013265 if (nonfmt == NULL)
13266 goto onError;
13267 r = _PyAccu_Accumulate(&acc, nonfmt);
13268 Py_DECREF(nonfmt);
13269 if (r)
13270 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013271 }
13272 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013273 /* Got a format specifier */
13274 int flags = 0;
13275 Py_ssize_t width = -1;
13276 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013277 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013278 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013279 int isnumok;
13280 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013281 void *pbuf = NULL;
13282 Py_ssize_t pindex, len;
13283 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013285 fmtpos++;
13286 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13287 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013288 Py_ssize_t keylen;
13289 PyObject *key;
13290 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013291
Benjamin Peterson29060642009-01-31 22:14:21 +000013292 if (dict == NULL) {
13293 PyErr_SetString(PyExc_TypeError,
13294 "format requires a mapping");
13295 goto onError;
13296 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013297 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013298 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013299 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013300 /* Skip over balanced parentheses */
13301 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013302 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013303 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013304 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013305 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013306 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013307 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013308 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013309 if (fmtcnt < 0 || pcount > 0) {
13310 PyErr_SetString(PyExc_ValueError,
13311 "incomplete format key");
13312 goto onError;
13313 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013314 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013315 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013316 if (key == NULL)
13317 goto onError;
13318 if (args_owned) {
13319 Py_DECREF(args);
13320 args_owned = 0;
13321 }
13322 args = PyObject_GetItem(dict, key);
13323 Py_DECREF(key);
13324 if (args == NULL) {
13325 goto onError;
13326 }
13327 args_owned = 1;
13328 arglen = -1;
13329 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013330 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013331 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013332 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013333 case '-': flags |= F_LJUST; continue;
13334 case '+': flags |= F_SIGN; continue;
13335 case ' ': flags |= F_BLANK; continue;
13336 case '#': flags |= F_ALT; continue;
13337 case '0': flags |= F_ZERO; continue;
13338 }
13339 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013340 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013341 if (c == '*') {
13342 v = getnextarg(args, arglen, &argidx);
13343 if (v == NULL)
13344 goto onError;
13345 if (!PyLong_Check(v)) {
13346 PyErr_SetString(PyExc_TypeError,
13347 "* wants int");
13348 goto onError;
13349 }
13350 width = PyLong_AsLong(v);
13351 if (width == -1 && PyErr_Occurred())
13352 goto onError;
13353 if (width < 0) {
13354 flags |= F_LJUST;
13355 width = -width;
13356 }
13357 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013358 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013359 }
13360 else if (c >= '0' && c <= '9') {
13361 width = c - '0';
13362 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013363 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013364 if (c < '0' || c > '9')
13365 break;
13366 if ((width*10) / 10 != width) {
13367 PyErr_SetString(PyExc_ValueError,
13368 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013369 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013370 }
13371 width = width*10 + (c - '0');
13372 }
13373 }
13374 if (c == '.') {
13375 prec = 0;
13376 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013377 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013378 if (c == '*') {
13379 v = getnextarg(args, arglen, &argidx);
13380 if (v == NULL)
13381 goto onError;
13382 if (!PyLong_Check(v)) {
13383 PyErr_SetString(PyExc_TypeError,
13384 "* wants int");
13385 goto onError;
13386 }
13387 prec = PyLong_AsLong(v);
13388 if (prec == -1 && PyErr_Occurred())
13389 goto onError;
13390 if (prec < 0)
13391 prec = 0;
13392 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013393 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013394 }
13395 else if (c >= '0' && c <= '9') {
13396 prec = c - '0';
13397 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013398 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013399 if (c < '0' || c > '9')
13400 break;
13401 if ((prec*10) / 10 != prec) {
13402 PyErr_SetString(PyExc_ValueError,
13403 "prec too big");
13404 goto onError;
13405 }
13406 prec = prec*10 + (c - '0');
13407 }
13408 }
13409 } /* prec */
13410 if (fmtcnt >= 0) {
13411 if (c == 'h' || c == 'l' || c == 'L') {
13412 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013413 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013414 }
13415 }
13416 if (fmtcnt < 0) {
13417 PyErr_SetString(PyExc_ValueError,
13418 "incomplete format");
13419 goto onError;
13420 }
13421 if (c != '%') {
13422 v = getnextarg(args, arglen, &argidx);
13423 if (v == NULL)
13424 goto onError;
13425 }
13426 sign = 0;
13427 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013428 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013429 switch (c) {
13430
13431 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013432 _PyAccu_Accumulate(&acc, percent);
13433 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013434
13435 case 's':
13436 case 'r':
13437 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013438 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013439 temp = v;
13440 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013441 }
13442 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013443 if (c == 's')
13444 temp = PyObject_Str(v);
13445 else if (c == 'r')
13446 temp = PyObject_Repr(v);
13447 else
13448 temp = PyObject_ASCII(v);
13449 if (temp == NULL)
13450 goto onError;
13451 if (PyUnicode_Check(temp))
13452 /* nothing to do */;
13453 else {
13454 Py_DECREF(temp);
13455 PyErr_SetString(PyExc_TypeError,
13456 "%s argument has non-string str()");
13457 goto onError;
13458 }
13459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013460 if (PyUnicode_READY(temp) == -1) {
13461 Py_CLEAR(temp);
13462 goto onError;
13463 }
13464 pbuf = PyUnicode_DATA(temp);
13465 kind = PyUnicode_KIND(temp);
13466 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013467 if (prec >= 0 && len > prec)
13468 len = prec;
13469 break;
13470
13471 case 'i':
13472 case 'd':
13473 case 'u':
13474 case 'o':
13475 case 'x':
13476 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013477 isnumok = 0;
13478 if (PyNumber_Check(v)) {
13479 PyObject *iobj=NULL;
13480
13481 if (PyLong_Check(v)) {
13482 iobj = v;
13483 Py_INCREF(iobj);
13484 }
13485 else {
13486 iobj = PyNumber_Long(v);
13487 }
13488 if (iobj!=NULL) {
13489 if (PyLong_Check(iobj)) {
13490 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013491 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013492 Py_DECREF(iobj);
13493 if (!temp)
13494 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013495 if (PyUnicode_READY(temp) == -1) {
13496 Py_CLEAR(temp);
13497 goto onError;
13498 }
13499 pbuf = PyUnicode_DATA(temp);
13500 kind = PyUnicode_KIND(temp);
13501 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013502 sign = 1;
13503 }
13504 else {
13505 Py_DECREF(iobj);
13506 }
13507 }
13508 }
13509 if (!isnumok) {
13510 PyErr_Format(PyExc_TypeError,
13511 "%%%c format: a number is required, "
13512 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13513 goto onError;
13514 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013515 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013516 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013517 fillobj = zero;
13518 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013519 break;
13520
13521 case 'e':
13522 case 'E':
13523 case 'f':
13524 case 'F':
13525 case 'g':
13526 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013527 temp = formatfloat(v, flags, prec, c);
13528 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013529 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013530 if (PyUnicode_READY(temp) == -1) {
13531 Py_CLEAR(temp);
13532 goto onError;
13533 }
13534 pbuf = PyUnicode_DATA(temp);
13535 kind = PyUnicode_KIND(temp);
13536 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013537 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013538 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013539 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013540 fillobj = zero;
13541 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013542 break;
13543
13544 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013545 {
13546 Py_UCS4 ch = formatchar(v);
13547 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013548 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013549 temp = _PyUnicode_FromUCS4(&ch, 1);
13550 if (temp == NULL)
13551 goto onError;
13552 pbuf = PyUnicode_DATA(temp);
13553 kind = PyUnicode_KIND(temp);
13554 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013555 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013556 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013557
13558 default:
13559 PyErr_Format(PyExc_ValueError,
13560 "unsupported format character '%c' (0x%x) "
13561 "at index %zd",
13562 (31<=c && c<=126) ? (char)c : '?',
13563 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013564 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013565 goto onError;
13566 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013567 /* pbuf is initialized here. */
13568 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013569 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013570 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13571 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013572 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013573 pindex++;
13574 }
13575 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13576 signobj = plus;
13577 len--;
13578 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013579 }
13580 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013581 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013582 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013583 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013584 else
13585 sign = 0;
13586 }
13587 if (width < len)
13588 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013589 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013590 if (fill != ' ') {
13591 assert(signobj != NULL);
13592 if (_PyAccu_Accumulate(&acc, signobj))
13593 goto onError;
13594 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013595 if (width > len)
13596 width--;
13597 }
13598 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013599 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013600 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013601 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013602 second = get_latin1_char(
13603 PyUnicode_READ(kind, pbuf, pindex + 1));
13604 pindex += 2;
13605 if (second == NULL ||
13606 _PyAccu_Accumulate(&acc, zero) ||
13607 _PyAccu_Accumulate(&acc, second))
13608 goto onError;
13609 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013610 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013611 width -= 2;
13612 if (width < 0)
13613 width = 0;
13614 len -= 2;
13615 }
13616 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013617 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013618 if (repeat_accumulate(&acc, fillobj, width - len))
13619 goto onError;
13620 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013621 }
13622 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013623 if (sign) {
13624 assert(signobj != NULL);
13625 if (_PyAccu_Accumulate(&acc, signobj))
13626 goto onError;
13627 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013628 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013629 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13630 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013631 second = get_latin1_char(
13632 PyUnicode_READ(kind, pbuf, pindex + 1));
13633 pindex += 2;
13634 if (second == NULL ||
13635 _PyAccu_Accumulate(&acc, zero) ||
13636 _PyAccu_Accumulate(&acc, second))
13637 goto onError;
13638 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013639 }
13640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013641 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013642 if (temp != NULL) {
13643 assert(pbuf == PyUnicode_DATA(temp));
13644 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013645 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013646 else {
13647 const char *p = (const char *) pbuf;
13648 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013649 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013650 v = PyUnicode_FromKindAndData(kind, p, len);
13651 }
13652 if (v == NULL)
13653 goto onError;
13654 r = _PyAccu_Accumulate(&acc, v);
13655 Py_DECREF(v);
13656 if (r)
13657 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013658 if (width > len && repeat_accumulate(&acc, blank, width - len))
13659 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013660 if (dict && (argidx < arglen) && c != '%') {
13661 PyErr_SetString(PyExc_TypeError,
13662 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013663 goto onError;
13664 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013665 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013666 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013667 } /* until end */
13668 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013669 PyErr_SetString(PyExc_TypeError,
13670 "not all arguments converted during string formatting");
13671 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013672 }
13673
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013674 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013675 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013676 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013677 }
13678 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013679 Py_XDECREF(temp);
13680 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013681 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013682
Benjamin Peterson29060642009-01-31 22:14:21 +000013683 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013684 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013685 Py_XDECREF(temp);
13686 Py_XDECREF(second);
13687 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013688 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013689 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013690 }
13691 return NULL;
13692}
13693
Jeremy Hylton938ace62002-07-17 16:30:39 +000013694static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013695unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13696
Tim Peters6d6c1a32001-08-02 04:15:00 +000013697static PyObject *
13698unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13699{
Benjamin Peterson29060642009-01-31 22:14:21 +000013700 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013701 static char *kwlist[] = {"object", "encoding", "errors", 0};
13702 char *encoding = NULL;
13703 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013704
Benjamin Peterson14339b62009-01-31 16:36:08 +000013705 if (type != &PyUnicode_Type)
13706 return unicode_subtype_new(type, args, kwds);
13707 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013708 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013709 return NULL;
13710 if (x == NULL)
Victor Stinner7931d9a2011-11-04 00:22:48 +010013711 return PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013712 if (encoding == NULL && errors == NULL)
13713 return PyObject_Str(x);
13714 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013715 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013716}
13717
Guido van Rossume023fe02001-08-30 03:12:59 +000013718static PyObject *
13719unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13720{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013721 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013722 Py_ssize_t length, char_size;
13723 int share_wstr, share_utf8;
13724 unsigned int kind;
13725 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013726
Benjamin Peterson14339b62009-01-31 16:36:08 +000013727 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013728
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013729 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013730 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013731 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013732 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013733 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013734 return NULL;
13735
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013736 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013737 if (self == NULL) {
13738 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013739 return NULL;
13740 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013741 kind = PyUnicode_KIND(unicode);
13742 length = PyUnicode_GET_LENGTH(unicode);
13743
13744 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013745#ifdef Py_DEBUG
13746 _PyUnicode_HASH(self) = -1;
13747#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013748 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013749#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013750 _PyUnicode_STATE(self).interned = 0;
13751 _PyUnicode_STATE(self).kind = kind;
13752 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013753 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013754 _PyUnicode_STATE(self).ready = 1;
13755 _PyUnicode_WSTR(self) = NULL;
13756 _PyUnicode_UTF8_LENGTH(self) = 0;
13757 _PyUnicode_UTF8(self) = NULL;
13758 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013759 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013760
13761 share_utf8 = 0;
13762 share_wstr = 0;
13763 if (kind == PyUnicode_1BYTE_KIND) {
13764 char_size = 1;
13765 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13766 share_utf8 = 1;
13767 }
13768 else if (kind == PyUnicode_2BYTE_KIND) {
13769 char_size = 2;
13770 if (sizeof(wchar_t) == 2)
13771 share_wstr = 1;
13772 }
13773 else {
13774 assert(kind == PyUnicode_4BYTE_KIND);
13775 char_size = 4;
13776 if (sizeof(wchar_t) == 4)
13777 share_wstr = 1;
13778 }
13779
13780 /* Ensure we won't overflow the length. */
13781 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13782 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013783 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013784 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013785 data = PyObject_MALLOC((length + 1) * char_size);
13786 if (data == NULL) {
13787 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013788 goto onError;
13789 }
13790
Victor Stinnerc3c74152011-10-02 20:39:55 +020013791 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013792 if (share_utf8) {
13793 _PyUnicode_UTF8_LENGTH(self) = length;
13794 _PyUnicode_UTF8(self) = data;
13795 }
13796 if (share_wstr) {
13797 _PyUnicode_WSTR_LENGTH(self) = length;
13798 _PyUnicode_WSTR(self) = (wchar_t *)data;
13799 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013800
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013801 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013802 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013803 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013804#ifdef Py_DEBUG
13805 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13806#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013807 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013808 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013809
13810onError:
13811 Py_DECREF(unicode);
13812 Py_DECREF(self);
13813 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013814}
13815
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013816PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013817 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013818\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013819Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013820encoding defaults to the current default string encoding.\n\
13821errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013822
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013823static PyObject *unicode_iter(PyObject *seq);
13824
Guido van Rossumd57fd912000-03-10 22:53:23 +000013825PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013826 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013827 "str", /* tp_name */
13828 sizeof(PyUnicodeObject), /* tp_size */
13829 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013830 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013831 (destructor)unicode_dealloc, /* tp_dealloc */
13832 0, /* tp_print */
13833 0, /* tp_getattr */
13834 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013835 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013836 unicode_repr, /* tp_repr */
13837 &unicode_as_number, /* tp_as_number */
13838 &unicode_as_sequence, /* tp_as_sequence */
13839 &unicode_as_mapping, /* tp_as_mapping */
13840 (hashfunc) unicode_hash, /* tp_hash*/
13841 0, /* tp_call*/
13842 (reprfunc) unicode_str, /* tp_str */
13843 PyObject_GenericGetAttr, /* tp_getattro */
13844 0, /* tp_setattro */
13845 0, /* tp_as_buffer */
13846 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013847 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013848 unicode_doc, /* tp_doc */
13849 0, /* tp_traverse */
13850 0, /* tp_clear */
13851 PyUnicode_RichCompare, /* tp_richcompare */
13852 0, /* tp_weaklistoffset */
13853 unicode_iter, /* tp_iter */
13854 0, /* tp_iternext */
13855 unicode_methods, /* tp_methods */
13856 0, /* tp_members */
13857 0, /* tp_getset */
13858 &PyBaseObject_Type, /* tp_base */
13859 0, /* tp_dict */
13860 0, /* tp_descr_get */
13861 0, /* tp_descr_set */
13862 0, /* tp_dictoffset */
13863 0, /* tp_init */
13864 0, /* tp_alloc */
13865 unicode_new, /* tp_new */
13866 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013867};
13868
13869/* Initialize the Unicode implementation */
13870
Victor Stinner3a50e702011-10-18 21:21:00 +020013871int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013872{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013873 int i;
13874
Thomas Wouters477c8d52006-05-27 19:21:47 +000013875 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013876 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013877 0x000A, /* LINE FEED */
13878 0x000D, /* CARRIAGE RETURN */
13879 0x001C, /* FILE SEPARATOR */
13880 0x001D, /* GROUP SEPARATOR */
13881 0x001E, /* RECORD SEPARATOR */
13882 0x0085, /* NEXT LINE */
13883 0x2028, /* LINE SEPARATOR */
13884 0x2029, /* PARAGRAPH SEPARATOR */
13885 };
13886
Fred Drakee4315f52000-05-09 19:53:39 +000013887 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013888 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013889 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013890 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010013891 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013892
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013893 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013894 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013895 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013896 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013897
13898 /* initialize the linebreak bloom filter */
13899 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013900 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013901 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013902
13903 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013904
13905#ifdef HAVE_MBCS
13906 winver.dwOSVersionInfoSize = sizeof(winver);
13907 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13908 PyErr_SetFromWindowsErr(0);
13909 return -1;
13910 }
13911#endif
13912 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013913}
13914
13915/* Finalize the Unicode implementation */
13916
/* There is nothing to release here: the function does no work and
   always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13922
Guido van Rossumd57fd912000-03-10 22:53:23 +000013923void
Thomas Wouters78890102000-07-22 19:25:51 +000013924_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013925{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013926 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013927
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013928 Py_XDECREF(unicode_empty);
13929 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013930
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013931 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013932 if (unicode_latin1[i]) {
13933 Py_DECREF(unicode_latin1[i]);
13934 unicode_latin1[i] = NULL;
13935 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013936 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013937 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013938 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013939}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013940
Walter Dörwald16807132007-05-25 13:52:07 +000013941void
13942PyUnicode_InternInPlace(PyObject **p)
13943{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013944 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013945 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013946#ifdef Py_DEBUG
13947 assert(s != NULL);
13948 assert(_PyUnicode_CHECK(s));
13949#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013950 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013951 return;
13952#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013953 /* If it's a subclass, we don't really know what putting
13954 it in the interned dict might do. */
13955 if (!PyUnicode_CheckExact(s))
13956 return;
13957 if (PyUnicode_CHECK_INTERNED(s))
13958 return;
13959 if (interned == NULL) {
13960 interned = PyDict_New();
13961 if (interned == NULL) {
13962 PyErr_Clear(); /* Don't leave an exception */
13963 return;
13964 }
13965 }
13966 /* It might be that the GetItem call fails even
13967 though the key is present in the dictionary,
13968 namely when this happens during a stack overflow. */
13969 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013970 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013971 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013972
Benjamin Peterson29060642009-01-31 22:14:21 +000013973 if (t) {
13974 Py_INCREF(t);
13975 Py_DECREF(*p);
13976 *p = t;
13977 return;
13978 }
Walter Dörwald16807132007-05-25 13:52:07 +000013979
Benjamin Peterson14339b62009-01-31 16:36:08 +000013980 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013981 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013982 PyErr_Clear();
13983 PyThreadState_GET()->recursion_critical = 0;
13984 return;
13985 }
13986 PyThreadState_GET()->recursion_critical = 0;
13987 /* The two references in interned are not counted by refcnt.
13988 The deallocator will take care of this */
13989 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013990 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013991}
13992
13993void
13994PyUnicode_InternImmortal(PyObject **p)
13995{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013996 PyUnicode_InternInPlace(p);
13997 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020013998 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013999 Py_INCREF(*p);
14000 }
Walter Dörwald16807132007-05-25 13:52:07 +000014001}
14002
14003PyObject *
14004PyUnicode_InternFromString(const char *cp)
14005{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014006 PyObject *s = PyUnicode_FromString(cp);
14007 if (s == NULL)
14008 return NULL;
14009 PyUnicode_InternInPlace(&s);
14010 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014011}
14012
Alexander Belopolsky40018472011-02-26 01:02:56 +000014013void
14014_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014015{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014016 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014017 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014018 Py_ssize_t i, n;
14019 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014020
Benjamin Peterson14339b62009-01-31 16:36:08 +000014021 if (interned == NULL || !PyDict_Check(interned))
14022 return;
14023 keys = PyDict_Keys(interned);
14024 if (keys == NULL || !PyList_Check(keys)) {
14025 PyErr_Clear();
14026 return;
14027 }
Walter Dörwald16807132007-05-25 13:52:07 +000014028
Benjamin Peterson14339b62009-01-31 16:36:08 +000014029 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14030 detector, interned unicode strings are not forcibly deallocated;
14031 rather, we give them their stolen references back, and then clear
14032 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014033
Benjamin Peterson14339b62009-01-31 16:36:08 +000014034 n = PyList_GET_SIZE(keys);
14035 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014036 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014037 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014038 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014039 if (PyUnicode_READY(s) == -1) {
14040 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014041 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014042 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014043 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014044 case SSTATE_NOT_INTERNED:
14045 /* XXX Shouldn't happen */
14046 break;
14047 case SSTATE_INTERNED_IMMORTAL:
14048 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014049 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014050 break;
14051 case SSTATE_INTERNED_MORTAL:
14052 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014053 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014054 break;
14055 default:
14056 Py_FatalError("Inconsistent interned string state.");
14057 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014058 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014059 }
14060 fprintf(stderr, "total size of all interned strings: "
14061 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14062 "mortal/immortal\n", mortal_size, immortal_size);
14063 Py_DECREF(keys);
14064 PyDict_Clear(interned);
14065 Py_DECREF(interned);
14066 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014067}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014068
14069
14070/********************* Unicode Iterator **************************/
14071
14072typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014073 PyObject_HEAD
14074 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014075 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014076} unicodeiterobject;
14077
14078static void
14079unicodeiter_dealloc(unicodeiterobject *it)
14080{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014081 _PyObject_GC_UNTRACK(it);
14082 Py_XDECREF(it->it_seq);
14083 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014084}
14085
14086static int
14087unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14088{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014089 Py_VISIT(it->it_seq);
14090 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014091}
14092
14093static PyObject *
14094unicodeiter_next(unicodeiterobject *it)
14095{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014096 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014097
Benjamin Peterson14339b62009-01-31 16:36:08 +000014098 assert(it != NULL);
14099 seq = it->it_seq;
14100 if (seq == NULL)
14101 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014102 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014104 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14105 int kind = PyUnicode_KIND(seq);
14106 void *data = PyUnicode_DATA(seq);
14107 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14108 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014109 if (item != NULL)
14110 ++it->it_index;
14111 return item;
14112 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014113
Benjamin Peterson14339b62009-01-31 16:36:08 +000014114 Py_DECREF(seq);
14115 it->it_seq = NULL;
14116 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014117}
14118
14119static PyObject *
14120unicodeiter_len(unicodeiterobject *it)
14121{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014122 Py_ssize_t len = 0;
14123 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014124 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014125 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014126}
14127
14128PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14129
14130static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014131 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014132 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014133 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014134};
14135
14136PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014137 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14138 "str_iterator", /* tp_name */
14139 sizeof(unicodeiterobject), /* tp_basicsize */
14140 0, /* tp_itemsize */
14141 /* methods */
14142 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14143 0, /* tp_print */
14144 0, /* tp_getattr */
14145 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014146 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014147 0, /* tp_repr */
14148 0, /* tp_as_number */
14149 0, /* tp_as_sequence */
14150 0, /* tp_as_mapping */
14151 0, /* tp_hash */
14152 0, /* tp_call */
14153 0, /* tp_str */
14154 PyObject_GenericGetAttr, /* tp_getattro */
14155 0, /* tp_setattro */
14156 0, /* tp_as_buffer */
14157 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14158 0, /* tp_doc */
14159 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14160 0, /* tp_clear */
14161 0, /* tp_richcompare */
14162 0, /* tp_weaklistoffset */
14163 PyObject_SelfIter, /* tp_iter */
14164 (iternextfunc)unicodeiter_next, /* tp_iternext */
14165 unicodeiter_methods, /* tp_methods */
14166 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014167};
14168
14169static PyObject *
14170unicode_iter(PyObject *seq)
14171{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014172 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014173
Benjamin Peterson14339b62009-01-31 16:36:08 +000014174 if (!PyUnicode_Check(seq)) {
14175 PyErr_BadInternalCall();
14176 return NULL;
14177 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014178 if (PyUnicode_READY(seq) == -1)
14179 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014180 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14181 if (it == NULL)
14182 return NULL;
14183 it->it_index = 0;
14184 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014185 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014186 _PyObject_GC_TRACK(it);
14187 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014188}
14189
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014190
14191size_t
14192Py_UNICODE_strlen(const Py_UNICODE *u)
14193{
14194 int res = 0;
14195 while(*u++)
14196 res++;
14197 return res;
14198}
14199
14200Py_UNICODE*
14201Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14202{
14203 Py_UNICODE *u = s1;
14204 while ((*u++ = *s2++));
14205 return s1;
14206}
14207
14208Py_UNICODE*
14209Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14210{
14211 Py_UNICODE *u = s1;
14212 while ((*u++ = *s2++))
14213 if (n-- == 0)
14214 break;
14215 return s1;
14216}
14217
14218Py_UNICODE*
14219Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14220{
14221 Py_UNICODE *u1 = s1;
14222 u1 += Py_UNICODE_strlen(u1);
14223 Py_UNICODE_strcpy(u1, s2);
14224 return s1;
14225}
14226
14227int
14228Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14229{
14230 while (*s1 && *s2 && *s1 == *s2)
14231 s1++, s2++;
14232 if (*s1 && *s2)
14233 return (*s1 < *s2) ? -1 : +1;
14234 if (*s1)
14235 return 1;
14236 if (*s2)
14237 return -1;
14238 return 0;
14239}
14240
14241int
14242Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14243{
14244 register Py_UNICODE u1, u2;
14245 for (; n != 0; n--) {
14246 u1 = *s1;
14247 u2 = *s2;
14248 if (u1 != u2)
14249 return (u1 < u2) ? -1 : +1;
14250 if (u1 == '\0')
14251 return 0;
14252 s1++;
14253 s2++;
14254 }
14255 return 0;
14256}
14257
14258Py_UNICODE*
14259Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14260{
14261 const Py_UNICODE *p;
14262 for (p = s; *p; p++)
14263 if (*p == c)
14264 return (Py_UNICODE*)p;
14265 return NULL;
14266}
14267
14268Py_UNICODE*
14269Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14270{
14271 const Py_UNICODE *p;
14272 p = s + Py_UNICODE_strlen(s);
14273 while (p != s) {
14274 p--;
14275 if (*p == c)
14276 return (Py_UNICODE*)p;
14277 }
14278 return NULL;
14279}
Victor Stinner331ea922010-08-10 16:37:20 +000014280
Victor Stinner71133ff2010-09-01 23:43:53 +000014281Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014282PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014283{
Victor Stinner577db2c2011-10-11 22:12:48 +020014284 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014285 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014286
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014287 if (!PyUnicode_Check(unicode)) {
14288 PyErr_BadArgument();
14289 return NULL;
14290 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014291 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014292 if (u == NULL)
14293 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014294 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014295 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014296 PyErr_NoMemory();
14297 return NULL;
14298 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014299 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014300 size *= sizeof(Py_UNICODE);
14301 copy = PyMem_Malloc(size);
14302 if (copy == NULL) {
14303 PyErr_NoMemory();
14304 return NULL;
14305 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014306 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014307 return copy;
14308}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014309
Georg Brandl66c221e2010-10-14 07:04:07 +000014310/* A _string module, to export formatter_parser and formatter_field_name_split
14311 to the string.Formatter class implemented in Python. */
14312
14313static PyMethodDef _string_methods[] = {
14314 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14315 METH_O, PyDoc_STR("split the argument as a field name")},
14316 {"formatter_parser", (PyCFunction) formatter_parser,
14317 METH_O, PyDoc_STR("parse the argument as a format string")},
14318 {NULL, NULL}
14319};
14320
14321static struct PyModuleDef _string_module = {
14322 PyModuleDef_HEAD_INIT,
14323 "_string",
14324 PyDoc_STR("string helper module"),
14325 0,
14326 _string_methods,
14327 NULL,
14328 NULL,
14329 NULL,
14330 NULL
14331};
14332
14333PyMODINIT_FUNC
14334PyInit__string(void)
14335{
14336 return PyModule_Create(&_string_module);
14337}
14338
14339
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014340#ifdef __cplusplus
14341}
14342#endif