blob: 16db801b5c3a8a5fcc8618d6828c798b7709d294 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Byte-order selection: big endian when the build configuration defines
   WORDS_BIGENDIAN, little endian in every other case. */

#if defined(WORDS_BIGENDIAN)
#  define BYTEORDER_IS_BIG_ENDIAN
#else
#  define BYTEORDER_IS_LITTLE_ENDIAN
#endif
56
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Sanity check used by the accessor macros in this file: the structural
   consistency check (without content scan) in debug builds, a plain type
   check otherwise. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020074
/* Raw field accessors.  They perform no checks and expand to the struct
   member itself, so they can also be used as assignment targets. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)

/* Checked accessors: the object is validated with _PyUnicode_CHECK()
   before the field is read. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 pointer and length of a ready string.  For a compact ASCII string
   the answer is the character data located directly after the
   PyASCIIObject header (and its length); otherwise the utf8 / utf8_length
   fields are used. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200109
/* File-local replacement for PyUnicode_READY: evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready().
   The argument is validated with _PyUnicode_CHECK() first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200116
/* True if the UTF-8 pointer of the object aliases its character data.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object aliases its character data.
   The expansion refers only to the macro argument, so it works for any
   expression, not just for a variable that happens to be called
   "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
124
/* Non-zero when the object owns a separately allocated UTF-8 buffer:
   it is not a compact ASCII string, its utf8 pointer is set, and that
   pointer does not alias the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when the object owns a separately allocated wstr buffer:
   its wstr pointer is set and either the string is not ready yet or the
   pointer does not alias the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) && \
      (!PyUnicode_IS_READY(op) || \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
140
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Every pointer argument is parenthesized before use, so expressions
   such as "buf + offset" may be passed safely.  The main loop copies four
   characters per iteration; the trailing loop handles the remaining
   0..3 characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + (n & ~ (Py_ssize_t) 3);             \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200165/* The Unicode string has been modified: reset the hash */
166#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
167
Walter Dörwald16807132007-05-25 13:52:07 +0000168/* This dictionary holds all interned unicode strings. Note that references
169 to strings in this dictionary are *not* counted in the string's ob_refcnt.
170 When the interned string reaches a refcnt of 0 the string deallocation
171 function will delete the reference from this dictionary.
172
173 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000174 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000175*/
176static PyObject *interned;
177
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000178/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200179static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200181/* List of static strings. */
182static _Py_Identifier *static_strings;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* Single character Unicode strings in the Latin-1 range are being
185 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200186static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by code point 0..127 (16 entries per row); an
   entry is 1 for:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN        0x1C FILE SEPARATOR
     0x1D GROUP SEPARATOR        0x1E RECORD SEPARATOR
     0x1F UNIT SEPARATOR         0x20 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
218
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200219/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200220static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200221static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200222static void copy_characters(
223 PyObject *to, Py_ssize_t to_start,
224 PyObject *from, Py_ssize_t from_start,
225 Py_ssize_t how_many);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200226
Alexander Belopolsky40018472011-02-26 01:02:56 +0000227static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200228unicode_fromascii(const unsigned char *s, Py_ssize_t size);
229static PyObject *
230_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
231static PyObject *
232_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
235
236static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000237unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000238 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100239 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000240 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
241
Alexander Belopolsky40018472011-02-26 01:02:56 +0000242static void
243raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300244 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100245 PyObject *unicode,
246 Py_ssize_t startpos, Py_ssize_t endpos,
247 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000248
/* Same kind of lookup table for line breaks.

   Indexed by code point 0..127 (16 entries per row); an entry is 1 for:
     0x0A LINE FEED         0x0B LINE TABULATION
     0x0C FORM FEED         0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR    0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
   and 0 for everything else. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
276
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300277/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
278 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000279Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000280PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000282#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000283 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 /* This is actually an illegal character, so it should
286 not be passed to unichr. */
287 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000288#endif
289}
290
#ifdef Py_DEBUG
/* Debug-build validation of the internal invariants of a Unicode object.

   op must be a Unicode object.  The state bits (kind, compact, ascii,
   ready) are checked against the data, utf8 and wstr pointers and their
   lengths.  If check_content is non-zero and the string is not a legacy
   wchar_t string, the characters are scanned as well to verify that no
   code point exceeds 0x10FFFF and that the narrowest possible kind is in
   use.

   Every violation trips an assert() (or abort() for an out-of-range code
   point, after dumping the string to stdout).  The function always
   returns 1 so that it can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII string: data follows the header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that has not been made ready: only the
                   wstr buffer exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* legacy string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the data buffer exactly when the kind has the
               same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (maxchar > 0x10FFFF) {
            printf("Invalid Unicode string! {");
            for (i=0; i < ascii->length; i++)
            {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (i)
                    printf(", U+%04x", ch);
                else
                    printf("U+%04x", ch);
            }
            /* length is a Py_ssize_t: use the matching length modifier
               instead of %lu, which is wrong wherever long and
               Py_ssize_t differ in size */
            printf("} (len=%" PY_FORMAT_SIZE_T "d)\n", ascii->length);
            abort();
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= 0x10FFFF);
        }
    }
    return 1;
}
#endif
415
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100416static PyObject*
417unicode_result_wchar(PyObject *unicode)
418{
419#ifndef Py_DEBUG
420 Py_ssize_t len;
421
422 assert(Py_REFCNT(unicode) == 1);
423
424 len = _PyUnicode_WSTR_LENGTH(unicode);
425 if (len == 0) {
426 Py_INCREF(unicode_empty);
427 Py_DECREF(unicode);
428 return unicode_empty;
429 }
430
431 if (len == 1) {
432 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
433 if (ch < 256) {
434 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
435 Py_DECREF(unicode);
436 return latin1_char;
437 }
438 }
439
440 if (_PyUnicode_Ready(unicode) < 0) {
441 Py_XDECREF(unicode);
442 return NULL;
443 }
444#else
445 /* don't make the result ready in debug mode to ensure that the caller
446 makes the string ready before using it */
447 assert(_PyUnicode_CheckConsistency(unicode, 1));
448#endif
449 return unicode;
450}
451
452static PyObject*
453unicode_result_ready(PyObject *unicode)
454{
455 Py_ssize_t length;
456
457 length = PyUnicode_GET_LENGTH(unicode);
458 if (length == 0) {
459 if (unicode != unicode_empty) {
460 Py_INCREF(unicode_empty);
461 Py_DECREF(unicode);
462 }
463 return unicode_empty;
464 }
465
466 if (length == 1) {
467 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
468 if (ch < 256) {
469 PyObject *latin1_char = unicode_latin1[ch];
470 if (latin1_char != NULL) {
471 if (unicode != latin1_char) {
472 Py_INCREF(latin1_char);
473 Py_DECREF(unicode);
474 }
475 return latin1_char;
476 }
477 else {
478 assert(_PyUnicode_CheckConsistency(unicode, 1));
479 Py_INCREF(unicode);
480 unicode_latin1[ch] = unicode;
481 return unicode;
482 }
483 }
484 }
485
486 assert(_PyUnicode_CheckConsistency(unicode, 1));
487 return unicode;
488}
489
490static PyObject*
491unicode_result(PyObject *unicode)
492{
493 assert(_PyUnicode_CHECK(unicode));
494 if (PyUnicode_IS_READY(unicode))
495 return unicode_result_ready(unicode);
496 else
497 return unicode_result_wchar(unicode);
498}
499
#if defined(HAVE_MBCS)
/* Windows version information; only present when HAVE_MBCS is defined. */
static OSVERSIONINFOEX winver;
#endif
503
Thomas Wouters477c8d52006-05-27 19:21:47 +0000504/* --- Bloom Filters ----------------------------------------------------- */
505
506/* stuff to implement simple "bloom filters" for Unicode characters.
507 to keep things simple, we use a single bitmask, using the least 5
508 bits from each unicode characters as the bit index. */
509
510/* the linebreak mask is set up by Unicode_Init below */
511
Antoine Pitrouf068f942010-01-13 14:19:12 +0000512#if LONG_BIT >= 128
513#define BLOOM_WIDTH 128
514#elif LONG_BIT >= 64
515#define BLOOM_WIDTH 64
516#elif LONG_BIT >= 32
517#define BLOOM_WIDTH 32
518#else
519#error "LONG_BIT is smaller than 32"
520#endif
521
Thomas Wouters477c8d52006-05-27 19:21:47 +0000522#define BLOOM_MASK unsigned long
523
524static BLOOM_MASK bloom_linebreak;
525
Antoine Pitrouf068f942010-01-13 14:19:12 +0000526#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
527#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000528
Benjamin Peterson29060642009-01-31 22:14:21 +0000529#define BLOOM_LINEBREAK(ch) \
530 ((ch) < 128U ? ascii_linebreak[(ch)] : \
531 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000532
Alexander Belopolsky40018472011-02-26 01:02:56 +0000533Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200534make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000535{
536 /* calculate simple bloom-style bitmask for a given unicode string */
537
Antoine Pitrouf068f942010-01-13 14:19:12 +0000538 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539 Py_ssize_t i;
540
541 mask = 0;
542 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200543 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000544
545 return mask;
546}
547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200548#define BLOOM_MEMBER(mask, chr, str) \
549 (BLOOM(mask, chr) \
550 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200552/* Compilation of templated routines */
553
554#include "stringlib/asciilib.h"
555#include "stringlib/fastsearch.h"
556#include "stringlib/partition.h"
557#include "stringlib/split.h"
558#include "stringlib/count.h"
559#include "stringlib/find.h"
560#include "stringlib/find_max_char.h"
561#include "stringlib/localeutil.h"
562#include "stringlib/undef.h"
563
564#include "stringlib/ucs1lib.h"
565#include "stringlib/fastsearch.h"
566#include "stringlib/partition.h"
567#include "stringlib/split.h"
568#include "stringlib/count.h"
569#include "stringlib/find.h"
570#include "stringlib/find_max_char.h"
571#include "stringlib/localeutil.h"
572#include "stringlib/undef.h"
573
574#include "stringlib/ucs2lib.h"
575#include "stringlib/fastsearch.h"
576#include "stringlib/partition.h"
577#include "stringlib/split.h"
578#include "stringlib/count.h"
579#include "stringlib/find.h"
580#include "stringlib/find_max_char.h"
581#include "stringlib/localeutil.h"
582#include "stringlib/undef.h"
583
584#include "stringlib/ucs4lib.h"
585#include "stringlib/fastsearch.h"
586#include "stringlib/partition.h"
587#include "stringlib/split.h"
588#include "stringlib/count.h"
589#include "stringlib/find.h"
590#include "stringlib/find_max_char.h"
591#include "stringlib/localeutil.h"
592#include "stringlib/undef.h"
593
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200594#include "stringlib/unicodedefs.h"
595#include "stringlib/fastsearch.h"
596#include "stringlib/count.h"
597#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100598#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200599
Guido van Rossumd57fd912000-03-10 22:53:23 +0000600/* --- Unicode Object ----------------------------------------------------- */
601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200602static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200603fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200604
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200605Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
606 Py_ssize_t size, Py_UCS4 ch,
607 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200608{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200609 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
610
611 switch (kind) {
612 case PyUnicode_1BYTE_KIND:
613 {
614 Py_UCS1 ch1 = (Py_UCS1) ch;
615 if (ch1 == ch)
616 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
617 else
618 return -1;
619 }
620 case PyUnicode_2BYTE_KIND:
621 {
622 Py_UCS2 ch2 = (Py_UCS2) ch;
623 if (ch2 == ch)
624 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
625 else
626 return -1;
627 }
628 case PyUnicode_4BYTE_KIND:
629 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
630 default:
631 assert(0);
632 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200633 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200634}
635
Victor Stinnerfe226c02011-10-03 03:52:20 +0200636static PyObject*
637resize_compact(PyObject *unicode, Py_ssize_t length)
638{
639 Py_ssize_t char_size;
640 Py_ssize_t struct_size;
641 Py_ssize_t new_size;
642 int share_wstr;
643
644 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200645 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200646 if (PyUnicode_IS_COMPACT_ASCII(unicode))
647 struct_size = sizeof(PyASCIIObject);
648 else
649 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200650 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200651
652 _Py_DEC_REFTOTAL;
653 _Py_ForgetReference(unicode);
654
655 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
656 PyErr_NoMemory();
657 return NULL;
658 }
659 new_size = (struct_size + (length + 1) * char_size);
660
661 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
662 if (unicode == NULL) {
663 PyObject_Del(unicode);
664 PyErr_NoMemory();
665 return NULL;
666 }
667 _Py_NewReference(unicode);
668 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200669 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200670 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200671 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
672 _PyUnicode_WSTR_LENGTH(unicode) = length;
673 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
675 length, 0);
676 return unicode;
677}
678
Alexander Belopolsky40018472011-02-26 01:02:56 +0000679static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200680resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000681{
Victor Stinner95663112011-10-04 01:03:50 +0200682 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200683 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200684 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000685
Victor Stinner95663112011-10-04 01:03:50 +0200686 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200687
688 if (PyUnicode_IS_READY(unicode)) {
689 Py_ssize_t char_size;
690 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200691 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200692 void *data;
693
694 data = _PyUnicode_DATA_ANY(unicode);
695 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200696 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200697 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
698 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200699 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
700 {
701 PyObject_DEL(_PyUnicode_UTF8(unicode));
702 _PyUnicode_UTF8(unicode) = NULL;
703 _PyUnicode_UTF8_LENGTH(unicode) = 0;
704 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200705
706 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
707 PyErr_NoMemory();
708 return -1;
709 }
710 new_size = (length + 1) * char_size;
711
712 data = (PyObject *)PyObject_REALLOC(data, new_size);
713 if (data == NULL) {
714 PyErr_NoMemory();
715 return -1;
716 }
717 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200718 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200719 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200720 _PyUnicode_WSTR_LENGTH(unicode) = length;
721 }
722 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200723 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200724 _PyUnicode_UTF8_LENGTH(unicode) = length;
725 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200726 _PyUnicode_LENGTH(unicode) = length;
727 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200728 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200729 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200730 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200731 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 }
Victor Stinner95663112011-10-04 01:03:50 +0200733 assert(_PyUnicode_WSTR(unicode) != NULL);
734
735 /* check for integer overflow */
736 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
737 PyErr_NoMemory();
738 return -1;
739 }
740 wstr = _PyUnicode_WSTR(unicode);
741 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
742 if (!wstr) {
743 PyErr_NoMemory();
744 return -1;
745 }
746 _PyUnicode_WSTR(unicode) = wstr;
747 _PyUnicode_WSTR(unicode)[length] = 0;
748 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200749 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000750 return 0;
751}
752
Victor Stinnerfe226c02011-10-03 03:52:20 +0200753static PyObject*
754resize_copy(PyObject *unicode, Py_ssize_t length)
755{
756 Py_ssize_t copy_length;
757 if (PyUnicode_IS_COMPACT(unicode)) {
758 PyObject *copy;
759 assert(PyUnicode_IS_READY(unicode));
760
761 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
762 if (copy == NULL)
763 return NULL;
764
765 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200766 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200767 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200768 }
769 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200770 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200771 assert(_PyUnicode_WSTR(unicode) != NULL);
772 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200773 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200774 if (w == NULL)
775 return NULL;
776 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
777 copy_length = Py_MIN(copy_length, length);
778 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
779 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200780 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200781 }
782}
783
Guido van Rossumd57fd912000-03-10 22:53:23 +0000784/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000785 Ux0000 terminated; some code (e.g. new_identifier)
786 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000787
788 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000789 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000790
791*/
792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200793#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200794static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200795#endif
796
Alexander Belopolsky40018472011-02-26 01:02:56 +0000797static PyUnicodeObject *
798_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000799{
800 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200801 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000802
Thomas Wouters477c8d52006-05-27 19:21:47 +0000803 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000804 if (length == 0 && unicode_empty != NULL) {
805 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200806 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000807 }
808
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000809 /* Ensure we won't overflow the size. */
810 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
811 return (PyUnicodeObject *)PyErr_NoMemory();
812 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200813 if (length < 0) {
814 PyErr_SetString(PyExc_SystemError,
815 "Negative size passed to _PyUnicode_New");
816 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000817 }
818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200819#ifdef Py_DEBUG
820 ++unicode_old_new_calls;
821#endif
822
823 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
824 if (unicode == NULL)
825 return NULL;
826 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
827 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
828 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000829 PyErr_NoMemory();
830 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200832
Jeremy Hyltond8082792003-09-16 19:41:39 +0000833 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000834 * the caller fails before initializing str -- unicode_resize()
835 * reads str[0], and the Keep-Alive optimization can keep memory
836 * allocated for str alive across a call to unicode_dealloc(unicode).
837 * We don't want unicode_resize to read uninitialized memory in
838 * that case.
839 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200840 _PyUnicode_WSTR(unicode)[0] = 0;
841 _PyUnicode_WSTR(unicode)[length] = 0;
842 _PyUnicode_WSTR_LENGTH(unicode) = length;
843 _PyUnicode_HASH(unicode) = -1;
844 _PyUnicode_STATE(unicode).interned = 0;
845 _PyUnicode_STATE(unicode).kind = 0;
846 _PyUnicode_STATE(unicode).compact = 0;
847 _PyUnicode_STATE(unicode).ready = 0;
848 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200849 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200850 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200851 _PyUnicode_UTF8(unicode) = NULL;
852 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100853 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000854 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000855
Benjamin Peterson29060642009-01-31 22:14:21 +0000856 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000857 /* XXX UNREF/NEWREF interface should be more symmetrical */
858 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000859 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000860 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000861 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000862}
863
Victor Stinnerf42dc442011-10-02 23:33:16 +0200864static const char*
865unicode_kind_name(PyObject *unicode)
866{
Victor Stinner42dfd712011-10-03 14:41:45 +0200867 /* don't check consistency: unicode_kind_name() is called from
868 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200869 if (!PyUnicode_IS_COMPACT(unicode))
870 {
871 if (!PyUnicode_IS_READY(unicode))
872 return "wstr";
873 switch(PyUnicode_KIND(unicode))
874 {
875 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200876 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200877 return "legacy ascii";
878 else
879 return "legacy latin1";
880 case PyUnicode_2BYTE_KIND:
881 return "legacy UCS2";
882 case PyUnicode_4BYTE_KIND:
883 return "legacy UCS4";
884 default:
885 return "<legacy invalid kind>";
886 }
887 }
888 assert(PyUnicode_IS_READY(unicode));
889 switch(PyUnicode_KIND(unicode))
890 {
891 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200892 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200893 return "ascii";
894 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200895 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200896 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200897 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200898 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200899 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200900 default:
901 return "<invalid compact kind>";
902 }
903}
904
#ifdef Py_DEBUG
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Wrapper around the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Wrapper around the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the object address, its compact flags and the candidate data
   addresses to stdout, then return PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line description of a string object to stdout: kind name,
   length, wstr pointer (flagged "shared" when it is the data pointer),
   and for everything except compact ASCII strings the wstr length and the
   utf8 pointer/length, followed by the data pointer.

   Lengths are signed sizes, so they are printed with %zd, and pointers
   are cast to void* as required by %p. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* Locate the character data without going through the checking
       macros: right after the struct for compact strings, otherwise
       through the data pointer. */
    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    printf("%s: len=%zd, ",unicode_kind_name(op), ascii->length);

    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", (void *)ascii->wstr);

    /* The wstr_length/utf8 fields are only read when the object is not a
       compact ASCII string. */
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
958
959PyObject *
960PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
961{
962 PyObject *obj;
963 PyCompactUnicodeObject *unicode;
964 void *data;
965 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200966 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 Py_ssize_t char_size;
968 Py_ssize_t struct_size;
969
970 /* Optimization for empty strings */
971 if (size == 0 && unicode_empty != NULL) {
972 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200973 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200974 }
975
976#ifdef Py_DEBUG
977 ++unicode_new_new_calls;
978#endif
979
Victor Stinner9e9d6892011-10-04 01:02:02 +0200980 is_ascii = 0;
981 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200982 struct_size = sizeof(PyCompactUnicodeObject);
983 if (maxchar < 128) {
984 kind_state = PyUnicode_1BYTE_KIND;
985 char_size = 1;
986 is_ascii = 1;
987 struct_size = sizeof(PyASCIIObject);
988 }
989 else if (maxchar < 256) {
990 kind_state = PyUnicode_1BYTE_KIND;
991 char_size = 1;
992 }
993 else if (maxchar < 65536) {
994 kind_state = PyUnicode_2BYTE_KIND;
995 char_size = 2;
996 if (sizeof(wchar_t) == 2)
997 is_sharing = 1;
998 }
999 else {
1000 kind_state = PyUnicode_4BYTE_KIND;
1001 char_size = 4;
1002 if (sizeof(wchar_t) == 4)
1003 is_sharing = 1;
1004 }
1005
1006 /* Ensure we won't overflow the size. */
1007 if (size < 0) {
1008 PyErr_SetString(PyExc_SystemError,
1009 "Negative size passed to PyUnicode_New");
1010 return NULL;
1011 }
1012 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1013 return PyErr_NoMemory();
1014
1015 /* Duplicated allocation code from _PyObject_New() instead of a call to
1016 * PyObject_New() so we are able to allocate space for the object and
1017 * it's data buffer.
1018 */
1019 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1020 if (obj == NULL)
1021 return PyErr_NoMemory();
1022 obj = PyObject_INIT(obj, &PyUnicode_Type);
1023 if (obj == NULL)
1024 return NULL;
1025
1026 unicode = (PyCompactUnicodeObject *)obj;
1027 if (is_ascii)
1028 data = ((PyASCIIObject*)obj) + 1;
1029 else
1030 data = unicode + 1;
1031 _PyUnicode_LENGTH(unicode) = size;
1032 _PyUnicode_HASH(unicode) = -1;
1033 _PyUnicode_STATE(unicode).interned = 0;
1034 _PyUnicode_STATE(unicode).kind = kind_state;
1035 _PyUnicode_STATE(unicode).compact = 1;
1036 _PyUnicode_STATE(unicode).ready = 1;
1037 _PyUnicode_STATE(unicode).ascii = is_ascii;
1038 if (is_ascii) {
1039 ((char*)data)[size] = 0;
1040 _PyUnicode_WSTR(unicode) = NULL;
1041 }
1042 else if (kind_state == PyUnicode_1BYTE_KIND) {
1043 ((char*)data)[size] = 0;
1044 _PyUnicode_WSTR(unicode) = NULL;
1045 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001046 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001047 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048 }
1049 else {
1050 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001051 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 if (kind_state == PyUnicode_2BYTE_KIND)
1053 ((Py_UCS2*)data)[size] = 0;
1054 else /* kind_state == PyUnicode_4BYTE_KIND */
1055 ((Py_UCS4*)data)[size] = 0;
1056 if (is_sharing) {
1057 _PyUnicode_WSTR_LENGTH(unicode) = size;
1058 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1059 }
1060 else {
1061 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1062 _PyUnicode_WSTR(unicode) = NULL;
1063 }
1064 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001065 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001066 return obj;
1067}
1068
#if SIZEOF_WCHAR_T == 2
/* Helper converting a 16-bit wchar_t sequence [begin, end) to UCS4 and
   storing it in the 4-byte data of `unicode`.  Valid surrogate pairs are
   combined into a single code point; any other unit (including a lone
   surrogate) is copied unchanged.  The other conversions are implemented
   as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        /* High surrogate immediately followed by a low surrogate? */
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src+1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src++;
            continue;
        }
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }

    /* Exactly _PyUnicode_GET_LENGTH(unicode) code points must be written. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1107
Victor Stinnercd9950f2011-10-02 00:34:53 +02001108static int
1109_PyUnicode_Dirty(PyObject *unicode)
1110{
Victor Stinner910337b2011-10-03 03:20:16 +02001111 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001112 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001113 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001114 "Cannot modify a string having more than 1 reference");
1115 return -1;
1116 }
1117 _PyUnicode_DIRTY(unicode);
1118 return 0;
1119}
1120
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001121static int
1122_copy_characters(PyObject *to, Py_ssize_t to_start,
1123 PyObject *from, Py_ssize_t from_start,
1124 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001126 unsigned int from_kind, to_kind;
1127 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001128 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130 assert(PyUnicode_Check(from));
1131 assert(PyUnicode_Check(to));
1132 assert(PyUnicode_IS_READY(from));
1133 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1136 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1137 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001139 if (how_many == 0)
1140 return 0;
1141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001143 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001145 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147#ifdef Py_DEBUG
1148 if (!check_maxchar
1149 && (from_kind > to_kind
1150 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1153 Py_UCS4 ch;
1154 Py_ssize_t i;
1155 for (i=0; i < how_many; i++) {
1156 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1157 assert(ch <= to_maxchar);
1158 }
1159 }
1160#endif
1161 fast = (from_kind == to_kind);
1162 if (check_maxchar
1163 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1164 {
1165 /* deny latin1 => ascii */
1166 fast = 0;
1167 }
1168
1169 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001170 Py_MEMCPY((char*)to_data + to_kind * to_start,
1171 (char*)from_data + from_kind * from_start,
1172 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001174 else if (from_kind == PyUnicode_1BYTE_KIND
1175 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001176 {
1177 _PyUnicode_CONVERT_BYTES(
1178 Py_UCS1, Py_UCS2,
1179 PyUnicode_1BYTE_DATA(from) + from_start,
1180 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1181 PyUnicode_2BYTE_DATA(to) + to_start
1182 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001183 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001184 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001185 && to_kind == PyUnicode_4BYTE_KIND)
1186 {
1187 _PyUnicode_CONVERT_BYTES(
1188 Py_UCS1, Py_UCS4,
1189 PyUnicode_1BYTE_DATA(from) + from_start,
1190 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1191 PyUnicode_4BYTE_DATA(to) + to_start
1192 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001193 }
1194 else if (from_kind == PyUnicode_2BYTE_KIND
1195 && to_kind == PyUnicode_4BYTE_KIND)
1196 {
1197 _PyUnicode_CONVERT_BYTES(
1198 Py_UCS2, Py_UCS4,
1199 PyUnicode_2BYTE_DATA(from) + from_start,
1200 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1201 PyUnicode_4BYTE_DATA(to) + to_start
1202 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001203 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001204 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001205 /* check if max_char(from substring) <= max_char(to) */
1206 if (from_kind > to_kind
1207 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001208 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001209 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001210 /* slow path to check for character overflow */
1211 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001212 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 Py_ssize_t i;
1214
Victor Stinner56c161a2011-10-06 02:47:11 +02001215#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001216 for (i=0; i < how_many; i++) {
1217 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001218 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001219 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1220 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001221#else
1222 if (!check_maxchar) {
1223 for (i=0; i < how_many; i++) {
1224 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1225 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1226 }
1227 }
1228 else {
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 if (ch > to_maxchar)
1232 return 1;
1233 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1234 }
1235 }
1236#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001237 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001238 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001239 assert(0 && "inconsistent state");
1240 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001241 }
1242 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001243 return 0;
1244}
1245
1246static void
1247copy_characters(PyObject *to, Py_ssize_t to_start,
1248 PyObject *from, Py_ssize_t from_start,
1249 Py_ssize_t how_many)
1250{
1251 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1252}
1253
1254Py_ssize_t
1255PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1256 PyObject *from, Py_ssize_t from_start,
1257 Py_ssize_t how_many)
1258{
1259 int err;
1260
1261 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1262 PyErr_BadInternalCall();
1263 return -1;
1264 }
1265
1266 if (PyUnicode_READY(from))
1267 return -1;
1268 if (PyUnicode_READY(to))
1269 return -1;
1270
1271 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1272 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1273 PyErr_Format(PyExc_SystemError,
1274 "Cannot write %zi characters at %zi "
1275 "in a string of %zi characters",
1276 how_many, to_start, PyUnicode_GET_LENGTH(to));
1277 return -1;
1278 }
1279
1280 if (how_many == 0)
1281 return 0;
1282
1283 if (_PyUnicode_Dirty(to))
1284 return -1;
1285
1286 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1287 if (err) {
1288 PyErr_Format(PyExc_SystemError,
1289 "Cannot copy %s characters "
1290 "into a string of %s characters",
1291 unicode_kind_name(from),
1292 unicode_kind_name(to));
1293 return -1;
1294 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001295 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296}
1297
Victor Stinner17222162011-09-28 22:15:37 +02001298/* Find the maximum code point and count the number of surrogate pairs so a
1299 correct string length can be computed before converting a string to UCS4.
1300 This function counts single surrogates as a character and not as a pair.
1301
1302 Return 0 on success, or -1 on error. */
1303static int
1304find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1305 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306{
1307 const wchar_t *iter;
1308
Victor Stinnerc53be962011-10-02 21:33:54 +02001309 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310 *num_surrogates = 0;
1311 *maxchar = 0;
1312
1313 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001314 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001316#if SIZEOF_WCHAR_T != 2
1317 if (*maxchar >= 0x10000)
1318 return 0;
1319#endif
1320 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321#if SIZEOF_WCHAR_T == 2
1322 if (*iter >= 0xD800 && *iter <= 0xDBFF
1323 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1324 {
1325 Py_UCS4 surrogate_val;
1326 surrogate_val = (((iter[0] & 0x3FF)<<10)
1327 | (iter[1] & 0x3FF)) + 0x10000;
1328 ++(*num_surrogates);
1329 if (surrogate_val > *maxchar)
1330 *maxchar = surrogate_val;
1331 iter += 2;
1332 }
1333 else
1334 iter++;
1335#else
1336 iter++;
1337#endif
1338 }
1339 return 0;
1340}
1341
1342#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001343static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344#endif
1345
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001346int
1347_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001348{
1349 wchar_t *end;
1350 Py_UCS4 maxchar = 0;
1351 Py_ssize_t num_surrogates;
1352#if SIZEOF_WCHAR_T == 2
1353 Py_ssize_t length_wo_surrogates;
1354#endif
1355
Georg Brandl7597add2011-10-05 16:36:47 +02001356 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001357 strings were created using _PyObject_New() and where no canonical
1358 representation (the str field) has been set yet aka strings
1359 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001360 assert(_PyUnicode_CHECK(unicode));
1361 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001362 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001363 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001364 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001365 /* Actually, it should neither be interned nor be anything else: */
1366 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367
1368#ifdef Py_DEBUG
1369 ++unicode_ready_calls;
1370#endif
1371
1372 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001373 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001374 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376
1377 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001378 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1379 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001380 PyErr_NoMemory();
1381 return -1;
1382 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001383 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384 _PyUnicode_WSTR(unicode), end,
1385 PyUnicode_1BYTE_DATA(unicode));
1386 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1387 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1388 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1389 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001390 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001391 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001392 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 }
1394 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001395 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001396 _PyUnicode_UTF8(unicode) = NULL;
1397 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398 }
1399 PyObject_FREE(_PyUnicode_WSTR(unicode));
1400 _PyUnicode_WSTR(unicode) = NULL;
1401 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1402 }
1403 /* In this case we might have to convert down from 4-byte native
1404 wchar_t to 2-byte unicode. */
1405 else if (maxchar < 65536) {
1406 assert(num_surrogates == 0 &&
1407 "FindMaxCharAndNumSurrogatePairs() messed up");
1408
Victor Stinner506f5922011-09-28 22:34:18 +02001409#if SIZEOF_WCHAR_T == 2
1410 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001411 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001412 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1413 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1414 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001415 _PyUnicode_UTF8(unicode) = NULL;
1416 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001417#else
1418 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001419 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001420 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001421 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001422 PyErr_NoMemory();
1423 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 }
Victor Stinner506f5922011-09-28 22:34:18 +02001425 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1426 _PyUnicode_WSTR(unicode), end,
1427 PyUnicode_2BYTE_DATA(unicode));
1428 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1429 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1430 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001431 _PyUnicode_UTF8(unicode) = NULL;
1432 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001433 PyObject_FREE(_PyUnicode_WSTR(unicode));
1434 _PyUnicode_WSTR(unicode) = NULL;
1435 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1436#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 }
1438 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1439 else {
1440#if SIZEOF_WCHAR_T == 2
1441 /* in case the native representation is 2-bytes, we need to allocate a
1442 new normalized 4-byte version. */
1443 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001444 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1445 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001446 PyErr_NoMemory();
1447 return -1;
1448 }
1449 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1450 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001451 _PyUnicode_UTF8(unicode) = NULL;
1452 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001453 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1454 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001455 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 PyObject_FREE(_PyUnicode_WSTR(unicode));
1457 _PyUnicode_WSTR(unicode) = NULL;
1458 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1459#else
1460 assert(num_surrogates == 0);
1461
Victor Stinnerc3c74152011-10-02 20:39:55 +02001462 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001464 _PyUnicode_UTF8(unicode) = NULL;
1465 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1467#endif
1468 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1469 }
1470 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001471 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 return 0;
1473}
1474
Alexander Belopolsky40018472011-02-26 01:02:56 +00001475static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001476unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477{
Walter Dörwald16807132007-05-25 13:52:07 +00001478 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001479 case SSTATE_NOT_INTERNED:
1480 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001481
Benjamin Peterson29060642009-01-31 22:14:21 +00001482 case SSTATE_INTERNED_MORTAL:
1483 /* revive dead object temporarily for DelItem */
1484 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001485 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001486 Py_FatalError(
1487 "deletion of interned string failed");
1488 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001489
Benjamin Peterson29060642009-01-31 22:14:21 +00001490 case SSTATE_INTERNED_IMMORTAL:
1491 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001492
Benjamin Peterson29060642009-01-31 22:14:21 +00001493 default:
1494 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001495 }
1496
Victor Stinner03490912011-10-03 23:45:12 +02001497 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001498 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001499 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001500 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001501
1502 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001503 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001504 }
1505 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001506 if (_PyUnicode_DATA_ANY(unicode))
1507 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001508 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509 }
1510}
1511
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or
   the cached one-character string stored in unicode_latin1[] for its
   character, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;
    /* Only one-character strings that are not of wchar kind can be cached */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;
    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1528
Alexander Belopolsky40018472011-02-26 01:02:56 +00001529static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001530unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001531{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001532 if (Py_REFCNT(unicode) != 1)
1533 return 0;
1534 if (PyUnicode_CHECK_INTERNED(unicode))
1535 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001536#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001537 /* singleton refcount is greater than 1 */
1538 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001539#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001540 return 1;
1541}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001542
Victor Stinnerfe226c02011-10-03 03:52:20 +02001543static int
1544unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1545{
1546 PyObject *unicode;
1547 Py_ssize_t old_length;
1548
1549 assert(p_unicode != NULL);
1550 unicode = *p_unicode;
1551
1552 assert(unicode != NULL);
1553 assert(PyUnicode_Check(unicode));
1554 assert(0 <= length);
1555
Victor Stinner910337b2011-10-03 03:20:16 +02001556 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001557 old_length = PyUnicode_WSTR_LENGTH(unicode);
1558 else
1559 old_length = PyUnicode_GET_LENGTH(unicode);
1560 if (old_length == length)
1561 return 0;
1562
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001563 if (length == 0) {
1564 Py_DECREF(*p_unicode);
1565 *p_unicode = unicode_empty;
1566 Py_INCREF(*p_unicode);
1567 return 0;
1568 }
1569
Victor Stinnerfe226c02011-10-03 03:52:20 +02001570 if (!unicode_resizable(unicode)) {
1571 PyObject *copy = resize_copy(unicode, length);
1572 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001573 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001574 Py_DECREF(*p_unicode);
1575 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001576 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001577 }
1578
Victor Stinnerfe226c02011-10-03 03:52:20 +02001579 if (PyUnicode_IS_COMPACT(unicode)) {
1580 *p_unicode = resize_compact(unicode, length);
1581 if (*p_unicode == NULL)
1582 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001583 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001584 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001585 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001586 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001587}
1588
Alexander Belopolsky40018472011-02-26 01:02:56 +00001589int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001590PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001591{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592 PyObject *unicode;
1593 if (p_unicode == NULL) {
1594 PyErr_BadInternalCall();
1595 return -1;
1596 }
1597 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001598 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001599 {
1600 PyErr_BadInternalCall();
1601 return -1;
1602 }
1603 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001604}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001605
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001606static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001607unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001608{
1609 PyObject *result;
1610 assert(PyUnicode_IS_READY(*p_unicode));
1611 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1612 return 0;
1613 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1614 maxchar);
1615 if (result == NULL)
1616 return -1;
1617 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1618 PyUnicode_GET_LENGTH(*p_unicode));
1619 Py_DECREF(*p_unicode);
1620 *p_unicode = result;
1621 return 0;
1622}
1623
1624static int
1625unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1626 Py_UCS4 ch)
1627{
1628 if (unicode_widen(p_unicode, ch) < 0)
1629 return -1;
1630 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1631 PyUnicode_DATA(*p_unicode),
1632 (*pos)++, ch);
1633 return 0;
1634}
1635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636static PyObject*
1637get_latin1_char(unsigned char ch)
1638{
Victor Stinnera464fc12011-10-02 20:39:30 +02001639 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001640 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001641 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 if (!unicode)
1643 return NULL;
1644 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001645 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646 unicode_latin1[ch] = unicode;
1647 }
1648 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001649 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001650}
1651
Alexander Belopolsky40018472011-02-26 01:02:56 +00001652PyObject *
1653PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001654{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001655 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656 Py_UCS4 maxchar = 0;
1657 Py_ssize_t num_surrogates;
1658
1659 if (u == NULL)
1660 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001662 /* If the Unicode data is known at construction time, we can apply
1663 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001665 /* Optimization for empty strings */
1666 if (size == 0 && unicode_empty != NULL) {
1667 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001668 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001669 }
Tim Petersced69f82003-09-16 20:30:58 +00001670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001671 /* Single character Unicode objects in the Latin-1 range are
1672 shared when using this constructor */
1673 if (size == 1 && *u < 256)
1674 return get_latin1_char((unsigned char)*u);
1675
1676 /* If not empty and not single character, copy the Unicode data
1677 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001678 if (find_maxchar_surrogates(u, u + size,
1679 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001680 return NULL;
1681
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001682 unicode = PyUnicode_New(size - num_surrogates,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001683 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684 if (!unicode)
1685 return NULL;
1686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001687 switch (PyUnicode_KIND(unicode)) {
1688 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001689 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001690 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1691 break;
1692 case PyUnicode_2BYTE_KIND:
1693#if Py_UNICODE_SIZE == 2
1694 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1695#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001696 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1698#endif
1699 break;
1700 case PyUnicode_4BYTE_KIND:
1701#if SIZEOF_WCHAR_T == 2
1702 /* This is the only case which has to process surrogates, thus
1703 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001704 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001705#else
1706 assert(num_surrogates == 0);
1707 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1708#endif
1709 break;
1710 default:
1711 assert(0 && "Impossible state");
1712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001714 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001715}
1716
Alexander Belopolsky40018472011-02-26 01:02:56 +00001717PyObject *
1718PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001719{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001720 if (size < 0) {
1721 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001722 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001723 return NULL;
1724 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001725
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001726 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001727 some optimizations which share commonly used objects.
1728 Also, this means the input must be UTF-8, so fall back to the
1729 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001730 if (u != NULL) {
1731
Benjamin Peterson29060642009-01-31 22:14:21 +00001732 /* Optimization for empty strings */
1733 if (size == 0 && unicode_empty != NULL) {
1734 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001735 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001736 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001737
1738 /* Single characters are shared when using this constructor.
1739 Restrict to ASCII, since the input must be UTF-8. */
Victor Stinner9faa3842011-10-23 20:06:00 +02001740 if (size == 1 && (unsigned char)*u < 128)
1741 return get_latin1_char((unsigned char)*u);
Martin v. Löwis9c121062007-08-05 20:26:11 +00001742
1743 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001744 }
1745
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001746 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001747}
1748
Alexander Belopolsky40018472011-02-26 01:02:56 +00001749PyObject *
1750PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001751{
1752 size_t size = strlen(u);
1753 if (size > PY_SSIZE_T_MAX) {
1754 PyErr_SetString(PyExc_OverflowError, "input too long");
1755 return NULL;
1756 }
1757
1758 return PyUnicode_FromStringAndSize(u, size);
1759}
1760
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001761PyObject *
1762_PyUnicode_FromId(_Py_Identifier *id)
1763{
1764 if (!id->object) {
1765 id->object = PyUnicode_FromString(id->string);
1766 if (!id->object)
1767 return NULL;
1768 PyUnicode_InternInPlace(&id->object);
1769 assert(!id->next);
1770 id->next = static_strings;
1771 static_strings = id;
1772 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001773 return id->object;
1774}
1775
1776void
1777_PyUnicode_ClearStaticStrings()
1778{
1779 _Py_Identifier *i;
1780 for (i = static_strings; i; i = i->next) {
1781 Py_DECREF(i->object);
1782 i->object = NULL;
1783 i->next = NULL;
1784 }
1785}
1786
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001787/* Internal function, don't check maximum character */
1788
Victor Stinnere57b1c02011-09-28 22:20:48 +02001789static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001790unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001791{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001792 PyObject *res;
1793#ifdef Py_DEBUG
1794 const unsigned char *p;
1795 const unsigned char *end = s + size;
1796 for (p=s; p < end; p++) {
1797 assert(*p < 128);
1798 }
1799#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001800 if (size == 1)
1801 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001802 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001803 if (!res)
1804 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001805 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001806 return res;
1807}
1808
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001809static Py_UCS4
1810kind_maxchar_limit(unsigned int kind)
1811{
1812 switch(kind) {
1813 case PyUnicode_1BYTE_KIND:
1814 return 0x80;
1815 case PyUnicode_2BYTE_KIND:
1816 return 0x100;
1817 case PyUnicode_4BYTE_KIND:
1818 return 0x10000;
1819 default:
1820 assert(0 && "invalid kind");
1821 return 0x10ffff;
1822 }
1823}
1824
Victor Stinner702c7342011-10-05 13:50:52 +02001825static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001826_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001827{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001829 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001830
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001831 if (size == 0) {
1832 Py_INCREF(unicode_empty);
1833 return unicode_empty;
1834 }
1835 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001836 if (size == 1)
1837 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001838
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001839 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001840 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841 if (!res)
1842 return NULL;
1843 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001844 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001846}
1847
Victor Stinnere57b1c02011-09-28 22:20:48 +02001848static PyObject*
1849_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001850{
1851 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001852 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001853
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001854 if (size == 0) {
1855 Py_INCREF(unicode_empty);
1856 return unicode_empty;
1857 }
1858 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001859 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001860 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001861
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001862 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001863 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001864 if (!res)
1865 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001866 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001867 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001868 else {
1869 _PyUnicode_CONVERT_BYTES(
1870 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1871 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001872 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873 return res;
1874}
1875
Victor Stinnere57b1c02011-09-28 22:20:48 +02001876static PyObject*
1877_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001878{
1879 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001880 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001881
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001882 if (size == 0) {
1883 Py_INCREF(unicode_empty);
1884 return unicode_empty;
1885 }
1886 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001887 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001888 return get_latin1_char((unsigned char)u[0]);
1889
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001890 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001891 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 if (!res)
1893 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001894 if (max_char < 256)
1895 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1896 PyUnicode_1BYTE_DATA(res));
1897 else if (max_char < 0x10000)
1898 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1899 PyUnicode_2BYTE_DATA(res));
1900 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001902 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 return res;
1904}
1905
1906PyObject*
1907PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1908{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001909 if (size < 0) {
1910 PyErr_SetString(PyExc_ValueError, "size must be positive");
1911 return NULL;
1912 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913 switch(kind) {
1914 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001915 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001916 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001917 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001918 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001919 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001920 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001921 PyErr_SetString(PyExc_SystemError, "invalid kind");
1922 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001924}
1925
Victor Stinner25a4b292011-10-06 12:31:55 +02001926/* Ensure that a string uses the most efficient storage, if it is not the
1927 case: create a new string with of the right kind. Write NULL into *p_unicode
1928 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001929static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001930unicode_adjust_maxchar(PyObject **p_unicode)
1931{
1932 PyObject *unicode, *copy;
1933 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001934 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001935 unsigned int kind;
1936
1937 assert(p_unicode != NULL);
1938 unicode = *p_unicode;
1939 assert(PyUnicode_IS_READY(unicode));
1940 if (PyUnicode_IS_ASCII(unicode))
1941 return;
1942
1943 len = PyUnicode_GET_LENGTH(unicode);
1944 kind = PyUnicode_KIND(unicode);
1945 if (kind == PyUnicode_1BYTE_KIND) {
1946 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001947 max_char = ucs1lib_find_max_char(u, u + len);
1948 if (max_char >= 128)
1949 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001950 }
1951 else if (kind == PyUnicode_2BYTE_KIND) {
1952 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001953 max_char = ucs2lib_find_max_char(u, u + len);
1954 if (max_char >= 256)
1955 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001956 }
1957 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001958 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001959 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001960 max_char = ucs4lib_find_max_char(u, u + len);
1961 if (max_char >= 0x10000)
1962 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001963 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001964 copy = PyUnicode_New(len, max_char);
1965 copy_characters(copy, 0, unicode, 0, len);
1966 Py_DECREF(unicode);
1967 *p_unicode = copy;
1968}
1969
Victor Stinner034f6cf2011-09-30 02:26:44 +02001970PyObject*
1971PyUnicode_Copy(PyObject *unicode)
1972{
Victor Stinner87af4f22011-11-21 23:03:47 +01001973 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001974 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001975
Victor Stinner034f6cf2011-09-30 02:26:44 +02001976 if (!PyUnicode_Check(unicode)) {
1977 PyErr_BadInternalCall();
1978 return NULL;
1979 }
1980 if (PyUnicode_READY(unicode))
1981 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001982
Victor Stinner87af4f22011-11-21 23:03:47 +01001983 length = PyUnicode_GET_LENGTH(unicode);
1984 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001985 if (!copy)
1986 return NULL;
1987 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1988
Victor Stinner87af4f22011-11-21 23:03:47 +01001989 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1990 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001991 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001992 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001993}
1994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995
Victor Stinnerbc603d12011-10-02 01:00:40 +02001996/* Widen Unicode objects to larger buffers. Don't write terminating null
1997 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998
1999void*
2000_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2001{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002002 Py_ssize_t len;
2003 void *result;
2004 unsigned int skind;
2005
2006 if (PyUnicode_READY(s))
2007 return NULL;
2008
2009 len = PyUnicode_GET_LENGTH(s);
2010 skind = PyUnicode_KIND(s);
2011 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002012 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002013 return NULL;
2014 }
2015 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002016 case PyUnicode_2BYTE_KIND:
2017 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2018 if (!result)
2019 return PyErr_NoMemory();
2020 assert(skind == PyUnicode_1BYTE_KIND);
2021 _PyUnicode_CONVERT_BYTES(
2022 Py_UCS1, Py_UCS2,
2023 PyUnicode_1BYTE_DATA(s),
2024 PyUnicode_1BYTE_DATA(s) + len,
2025 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002027 case PyUnicode_4BYTE_KIND:
2028 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2029 if (!result)
2030 return PyErr_NoMemory();
2031 if (skind == PyUnicode_2BYTE_KIND) {
2032 _PyUnicode_CONVERT_BYTES(
2033 Py_UCS2, Py_UCS4,
2034 PyUnicode_2BYTE_DATA(s),
2035 PyUnicode_2BYTE_DATA(s) + len,
2036 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002038 else {
2039 assert(skind == PyUnicode_1BYTE_KIND);
2040 _PyUnicode_CONVERT_BYTES(
2041 Py_UCS1, Py_UCS4,
2042 PyUnicode_1BYTE_DATA(s),
2043 PyUnicode_1BYTE_DATA(s) + len,
2044 result);
2045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002047 default:
2048 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002049 }
Victor Stinner01698042011-10-04 00:04:26 +02002050 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002051 return NULL;
2052}
2053
2054static Py_UCS4*
2055as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2056 int copy_null)
2057{
2058 int kind;
2059 void *data;
2060 Py_ssize_t len, targetlen;
2061 if (PyUnicode_READY(string) == -1)
2062 return NULL;
2063 kind = PyUnicode_KIND(string);
2064 data = PyUnicode_DATA(string);
2065 len = PyUnicode_GET_LENGTH(string);
2066 targetlen = len;
2067 if (copy_null)
2068 targetlen++;
2069 if (!target) {
2070 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2071 PyErr_NoMemory();
2072 return NULL;
2073 }
2074 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2075 if (!target) {
2076 PyErr_NoMemory();
2077 return NULL;
2078 }
2079 }
2080 else {
2081 if (targetsize < targetlen) {
2082 PyErr_Format(PyExc_SystemError,
2083 "string is longer than the buffer");
2084 if (copy_null && 0 < targetsize)
2085 target[0] = 0;
2086 return NULL;
2087 }
2088 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002089 if (kind == PyUnicode_1BYTE_KIND) {
2090 Py_UCS1 *start = (Py_UCS1 *) data;
2091 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002092 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002093 else if (kind == PyUnicode_2BYTE_KIND) {
2094 Py_UCS2 *start = (Py_UCS2 *) data;
2095 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2096 }
2097 else {
2098 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002099 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002100 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002101 if (copy_null)
2102 target[len] = 0;
2103 return target;
2104}
2105
2106Py_UCS4*
2107PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2108 int copy_null)
2109{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002110 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002111 PyErr_BadInternalCall();
2112 return NULL;
2113 }
2114 return as_ucs4(string, target, targetsize, copy_null);
2115}
2116
2117Py_UCS4*
2118PyUnicode_AsUCS4Copy(PyObject *string)
2119{
2120 return as_ucs4(string, NULL, 0, 1);
2121}
2122
#ifdef HAVE_WCHAR_H

/* Create a unicode object from the wchar_t buffer `w`.  A size of -1
   means the length is measured with wcslen().  With w == NULL, an empty
   string is returned when size is 0 and a bad-internal-call error is
   raised otherwise. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w == NULL) {
        if (size != 0) {
            PyErr_BadInternalCall();
            return NULL;
        }
        return PyUnicode_New(0, 0);
    }

    if (size == -1)
        size = wcslen(w);

    return PyUnicode_FromUnicode(w, size);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002143
Walter Dörwald346737f2007-05-31 10:44:43 +00002144static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002145makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2146 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002147{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002148 *fmt++ = '%';
2149 if (width) {
2150 if (zeropad)
2151 *fmt++ = '0';
2152 fmt += sprintf(fmt, "%d", width);
2153 }
2154 if (precision)
2155 fmt += sprintf(fmt, ".%d", precision);
2156 if (longflag)
2157 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002158 else if (longlongflag) {
2159 /* longlongflag should only ever be nonzero on machines with
2160 HAVE_LONG_LONG defined */
2161#ifdef HAVE_LONG_LONG
2162 char *f = PY_FORMAT_LONG_LONG;
2163 while (*f)
2164 *fmt++ = *f++;
2165#else
2166 /* we shouldn't ever get here */
2167 assert(0);
2168 *fmt++ = 'l';
2169#endif
2170 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002171 else if (size_tflag) {
2172 char *f = PY_FORMAT_SIZE_T;
2173 while (*f)
2174 *fmt++ = *f++;
2175 }
2176 *fmt++ = c;
2177 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002178}
2179
/* helper for PyUnicode_FromFormatV()

   `f` points at the '%' of a conversion specification.  Parses the
   optional width, ".precision" and length modifier ("l", "ll" when
   HAVE_LONG_LONG is defined, or "z"; a modifier is only recognized when
   followed by 'd', 'u' or 'i').  Each result is stored through the
   corresponding pointer when that pointer is not NULL.  Returns the
   position reached after parsing. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %lld and %llu. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        f += 1;
    }
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2244
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002245/* maximum number of characters required for output of %ld. 21 characters
2246 allows for 64-bit integers (in decimal) and an optional sign. */
2247#define MAX_LONG_CHARS 21
2248/* maximum number of characters required for output of %lld.
2249 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2250 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2251#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2252
Walter Dörwaldd2034312007-05-18 16:29:38 +00002253PyObject *
2254PyUnicode_FromFormatV(const char *format, va_list vargs)
2255{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002256 va_list count;
2257 Py_ssize_t callcount = 0;
2258 PyObject **callresults = NULL;
2259 PyObject **callresult = NULL;
2260 Py_ssize_t n = 0;
2261 int width = 0;
2262 int precision = 0;
2263 int zeropad;
2264 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002265 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002266 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002267 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2269 Py_UCS4 argmaxchar;
2270 Py_ssize_t numbersize = 0;
2271 char *numberresults = NULL;
2272 char *numberresult = NULL;
2273 Py_ssize_t i;
2274 int kind;
2275 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002276
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002277 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002278 /* step 1: count the number of %S/%R/%A/%s format specifications
2279 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2280 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002282 * also estimate a upper bound for all the number formats in the string,
2283 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002285 for (f = format; *f; f++) {
2286 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002287 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002288 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2289 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2290 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2291 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002293 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002294#ifdef HAVE_LONG_LONG
2295 if (longlongflag) {
2296 if (width < MAX_LONG_LONG_CHARS)
2297 width = MAX_LONG_LONG_CHARS;
2298 }
2299 else
2300#endif
2301 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2302 including sign. Decimal takes the most space. This
2303 isn't enough for octal. If a width is specified we
2304 need more (which we allocate later). */
2305 if (width < MAX_LONG_CHARS)
2306 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002307
2308 /* account for the size + '\0' to separate numbers
2309 inside of the numberresults buffer */
2310 numbersize += (width + 1);
2311 }
2312 }
2313 else if ((unsigned char)*f > 127) {
2314 PyErr_Format(PyExc_ValueError,
2315 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2316 "string, got a non-ASCII byte: 0x%02x",
2317 (unsigned char)*f);
2318 return NULL;
2319 }
2320 }
2321 /* step 2: allocate memory for the results of
2322 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2323 if (callcount) {
2324 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2325 if (!callresults) {
2326 PyErr_NoMemory();
2327 return NULL;
2328 }
2329 callresult = callresults;
2330 }
2331 /* step 2.5: allocate memory for the results of formating numbers */
2332 if (numbersize) {
2333 numberresults = PyObject_Malloc(numbersize);
2334 if (!numberresults) {
2335 PyErr_NoMemory();
2336 goto fail;
2337 }
2338 numberresult = numberresults;
2339 }
2340
2341 /* step 3: format numbers and figure out how large a buffer we need */
2342 for (f = format; *f; f++) {
2343 if (*f == '%') {
2344 const char* p;
2345 int longflag;
2346 int longlongflag;
2347 int size_tflag;
2348 int numprinted;
2349
2350 p = f;
2351 zeropad = (f[1] == '0');
2352 f = parse_format_flags(f, &width, &precision,
2353 &longflag, &longlongflag, &size_tflag);
2354 switch (*f) {
2355 case 'c':
2356 {
2357 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002358 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002359 n++;
2360 break;
2361 }
2362 case '%':
2363 n++;
2364 break;
2365 case 'i':
2366 case 'd':
2367 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2368 width, precision, *f);
2369 if (longflag)
2370 numprinted = sprintf(numberresult, fmt,
2371 va_arg(count, long));
2372#ifdef HAVE_LONG_LONG
2373 else if (longlongflag)
2374 numprinted = sprintf(numberresult, fmt,
2375 va_arg(count, PY_LONG_LONG));
2376#endif
2377 else if (size_tflag)
2378 numprinted = sprintf(numberresult, fmt,
2379 va_arg(count, Py_ssize_t));
2380 else
2381 numprinted = sprintf(numberresult, fmt,
2382 va_arg(count, int));
2383 n += numprinted;
2384 /* advance by +1 to skip over the '\0' */
2385 numberresult += (numprinted + 1);
2386 assert(*(numberresult - 1) == '\0');
2387 assert(*(numberresult - 2) != '\0');
2388 assert(numprinted >= 0);
2389 assert(numberresult <= numberresults + numbersize);
2390 break;
2391 case 'u':
2392 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2393 width, precision, 'u');
2394 if (longflag)
2395 numprinted = sprintf(numberresult, fmt,
2396 va_arg(count, unsigned long));
2397#ifdef HAVE_LONG_LONG
2398 else if (longlongflag)
2399 numprinted = sprintf(numberresult, fmt,
2400 va_arg(count, unsigned PY_LONG_LONG));
2401#endif
2402 else if (size_tflag)
2403 numprinted = sprintf(numberresult, fmt,
2404 va_arg(count, size_t));
2405 else
2406 numprinted = sprintf(numberresult, fmt,
2407 va_arg(count, unsigned int));
2408 n += numprinted;
2409 numberresult += (numprinted + 1);
2410 assert(*(numberresult - 1) == '\0');
2411 assert(*(numberresult - 2) != '\0');
2412 assert(numprinted >= 0);
2413 assert(numberresult <= numberresults + numbersize);
2414 break;
2415 case 'x':
2416 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2417 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2418 n += numprinted;
2419 numberresult += (numprinted + 1);
2420 assert(*(numberresult - 1) == '\0');
2421 assert(*(numberresult - 2) != '\0');
2422 assert(numprinted >= 0);
2423 assert(numberresult <= numberresults + numbersize);
2424 break;
2425 case 'p':
2426 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2427 /* %p is ill-defined: ensure leading 0x. */
2428 if (numberresult[1] == 'X')
2429 numberresult[1] = 'x';
2430 else if (numberresult[1] != 'x') {
2431 memmove(numberresult + 2, numberresult,
2432 strlen(numberresult) + 1);
2433 numberresult[0] = '0';
2434 numberresult[1] = 'x';
2435 numprinted += 2;
2436 }
2437 n += numprinted;
2438 numberresult += (numprinted + 1);
2439 assert(*(numberresult - 1) == '\0');
2440 assert(*(numberresult - 2) != '\0');
2441 assert(numprinted >= 0);
2442 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002443 break;
2444 case 's':
2445 {
2446 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002447 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002448 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2449 if (!str)
2450 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 /* since PyUnicode_DecodeUTF8 returns already flexible
2452 unicode objects, there is no need to call ready on them */
2453 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002454 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002455 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002456 /* Remember the str and switch to the next slot */
2457 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002458 break;
2459 }
2460 case 'U':
2461 {
2462 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002463 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002464 if (PyUnicode_READY(obj) == -1)
2465 goto fail;
2466 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002467 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002468 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002469 break;
2470 }
2471 case 'V':
2472 {
2473 PyObject *obj = va_arg(count, PyObject *);
2474 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002475 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002476 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002477 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002478 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002479 if (PyUnicode_READY(obj) == -1)
2480 goto fail;
2481 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002482 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002483 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002484 *callresult++ = NULL;
2485 }
2486 else {
2487 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2488 if (!str_obj)
2489 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002490 if (PyUnicode_READY(str_obj)) {
2491 Py_DECREF(str_obj);
2492 goto fail;
2493 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002494 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002495 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002496 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002497 *callresult++ = str_obj;
2498 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002499 break;
2500 }
2501 case 'S':
2502 {
2503 PyObject *obj = va_arg(count, PyObject *);
2504 PyObject *str;
2505 assert(obj);
2506 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002507 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002508 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002509 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002510 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002511 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002512 /* Remember the str and switch to the next slot */
2513 *callresult++ = str;
2514 break;
2515 }
2516 case 'R':
2517 {
2518 PyObject *obj = va_arg(count, PyObject *);
2519 PyObject *repr;
2520 assert(obj);
2521 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002522 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002523 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002524 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002525 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002526 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002527 /* Remember the repr and switch to the next slot */
2528 *callresult++ = repr;
2529 break;
2530 }
2531 case 'A':
2532 {
2533 PyObject *obj = va_arg(count, PyObject *);
2534 PyObject *ascii;
2535 assert(obj);
2536 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002537 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002538 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002539 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002540 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002541 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002542 /* Remember the repr and switch to the next slot */
2543 *callresult++ = ascii;
2544 break;
2545 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002546 default:
2547 /* if we stumble upon an unknown
2548 formatting code, copy the rest of
2549 the format string to the output
2550 string. (we cannot just skip the
2551 code, since there's no way to know
2552 what's in the argument list) */
2553 n += strlen(p);
2554 goto expand;
2555 }
2556 } else
2557 n++;
2558 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002559 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002560 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002561 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002562 we don't have to resize the string.
2563 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002564 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002565 if (!string)
2566 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002567 kind = PyUnicode_KIND(string);
2568 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002569 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002570 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002572 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002573 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002574 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002575
2576 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002577 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2578 /* checking for == because the last argument could be a empty
2579 string, which causes i to point to end, the assert at the end of
2580 the loop */
2581 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002582
Benjamin Peterson14339b62009-01-31 16:36:08 +00002583 switch (*f) {
2584 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002585 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002586 const int ordinal = va_arg(vargs, int);
2587 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002588 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002589 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002590 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002592 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002593 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002594 case 'p':
2595 /* unused, since we already have the result */
2596 if (*f == 'p')
2597 (void) va_arg(vargs, void *);
2598 else
2599 (void) va_arg(vargs, int);
2600 /* extract the result from numberresults and append. */
2601 for (; *numberresult; ++i, ++numberresult)
2602 PyUnicode_WRITE(kind, data, i, *numberresult);
2603 /* skip over the separating '\0' */
2604 assert(*numberresult == '\0');
2605 numberresult++;
2606 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002607 break;
2608 case 's':
2609 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002610 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002611 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002612 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002613 size = PyUnicode_GET_LENGTH(*callresult);
2614 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002615 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002616 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002617 /* We're done with the unicode()/repr() => forget it */
2618 Py_DECREF(*callresult);
2619 /* switch to next unicode()/repr() result */
2620 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002621 break;
2622 }
2623 case 'U':
2624 {
2625 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 Py_ssize_t size;
2627 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2628 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002629 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002630 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002631 break;
2632 }
2633 case 'V':
2634 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002636 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002637 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002638 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002639 size = PyUnicode_GET_LENGTH(obj);
2640 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002641 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002642 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002643 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002644 size = PyUnicode_GET_LENGTH(*callresult);
2645 assert(PyUnicode_KIND(*callresult) <=
2646 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002647 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002648 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002649 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002650 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002651 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002652 break;
2653 }
2654 case 'S':
2655 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002656 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002657 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002658 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002659 /* unused, since we already have the result */
2660 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002661 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002662 copy_characters(string, i, *callresult, 0, size);
2663 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002664 /* We're done with the unicode()/repr() => forget it */
2665 Py_DECREF(*callresult);
2666 /* switch to next unicode()/repr() result */
2667 ++callresult;
2668 break;
2669 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002670 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002671 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002672 break;
2673 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002674 for (; *p; ++p, ++i)
2675 PyUnicode_WRITE(kind, data, i, *p);
2676 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002677 goto end;
2678 }
Victor Stinner1205f272010-09-11 00:54:47 +00002679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002680 else {
2681 assert(i < PyUnicode_GET_LENGTH(string));
2682 PyUnicode_WRITE(kind, data, i++, *f);
2683 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002685 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002686
Benjamin Peterson29060642009-01-31 22:14:21 +00002687 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002688 if (callresults)
2689 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002690 if (numberresults)
2691 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002692 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002693 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 if (callresults) {
2695 PyObject **callresult2 = callresults;
2696 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002697 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002698 ++callresult2;
2699 }
2700 PyObject_Free(callresults);
2701 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002702 if (numberresults)
2703 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002704 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002705}
2706
Walter Dörwaldd2034312007-05-18 16:29:38 +00002707PyObject *
2708PyUnicode_FromFormat(const char *format, ...)
2709{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002710 PyObject* ret;
2711 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002712
2713#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002714 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002715#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002716 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002717#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002718 ret = PyUnicode_FromFormatV(format, vargs);
2719 va_end(vargs);
2720 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002721}
2722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002723#ifdef HAVE_WCHAR_H
2724
Victor Stinner5593d8a2010-10-02 11:11:27 +00002725/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2726 convert a Unicode object to a wide character string.
2727
Victor Stinnerd88d9832011-09-06 02:00:05 +02002728 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002729 character) required to convert the unicode object. Ignore size argument.
2730
Victor Stinnerd88d9832011-09-06 02:00:05 +02002731 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002732 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002733 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002734static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002735unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002736 wchar_t *w,
2737 Py_ssize_t size)
2738{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002739 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002740 const wchar_t *wstr;
2741
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002742 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002743 if (wstr == NULL)
2744 return -1;
2745
Victor Stinner5593d8a2010-10-02 11:11:27 +00002746 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002747 if (size > res)
2748 size = res + 1;
2749 else
2750 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002752 return res;
2753 }
2754 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002755 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002756}
2757
2758Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002759PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002760 wchar_t *w,
2761 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762{
2763 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002764 PyErr_BadInternalCall();
2765 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002767 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002768}
2769
Victor Stinner137c34c2010-09-29 10:25:54 +00002770wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002771PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002772 Py_ssize_t *size)
2773{
2774 wchar_t* buffer;
2775 Py_ssize_t buflen;
2776
2777 if (unicode == NULL) {
2778 PyErr_BadInternalCall();
2779 return NULL;
2780 }
2781
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002782 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002783 if (buflen == -1)
2784 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002785 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002786 PyErr_NoMemory();
2787 return NULL;
2788 }
2789
Victor Stinner137c34c2010-09-29 10:25:54 +00002790 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2791 if (buffer == NULL) {
2792 PyErr_NoMemory();
2793 return NULL;
2794 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002795 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002796 if (buflen == -1)
2797 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002798 if (size != NULL)
2799 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002800 return buffer;
2801}
2802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002803#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804
Alexander Belopolsky40018472011-02-26 01:02:56 +00002805PyObject *
2806PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002807{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002808 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002809 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002810 PyErr_SetString(PyExc_ValueError,
2811 "chr() arg not in range(0x110000)");
2812 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002813 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002815 if (ordinal < 256)
2816 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002818 v = PyUnicode_New(1, ordinal);
2819 if (v == NULL)
2820 return NULL;
2821 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002822 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002823 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002824}
2825
Alexander Belopolsky40018472011-02-26 01:02:56 +00002826PyObject *
2827PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002829 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002830 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002831 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002832 if (PyUnicode_READY(obj))
2833 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002834 Py_INCREF(obj);
2835 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002836 }
2837 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002838 /* For a Unicode subtype that's not a Unicode object,
2839 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002840 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002841 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002842 PyErr_Format(PyExc_TypeError,
2843 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002844 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002845 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002846}
2847
Alexander Belopolsky40018472011-02-26 01:02:56 +00002848PyObject *
2849PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002850 const char *encoding,
2851 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002852{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002853 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002854 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002855
Guido van Rossumd57fd912000-03-10 22:53:23 +00002856 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002857 PyErr_BadInternalCall();
2858 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002859 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002860
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002861 /* Decoding bytes objects is the most common case and should be fast */
2862 if (PyBytes_Check(obj)) {
2863 if (PyBytes_GET_SIZE(obj) == 0) {
2864 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002865 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002866 }
2867 else {
2868 v = PyUnicode_Decode(
2869 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2870 encoding, errors);
2871 }
2872 return v;
2873 }
2874
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002875 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002876 PyErr_SetString(PyExc_TypeError,
2877 "decoding str is not supported");
2878 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002879 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002880
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002881 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2882 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2883 PyErr_Format(PyExc_TypeError,
2884 "coercing to str: need bytes, bytearray "
2885 "or buffer-like object, %.80s found",
2886 Py_TYPE(obj)->tp_name);
2887 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002888 }
Tim Petersced69f82003-09-16 20:30:58 +00002889
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002890 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002891 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002892 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002893 }
Tim Petersced69f82003-09-16 20:30:58 +00002894 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002895 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002896
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002897 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002898 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002899}
2900
/* Normalize an encoding name into the caller-supplied buffer `lower`, which
   has room for `lower_len` bytes including the terminating NUL: upper-case
   letters are lowered and '_' is replaced with '-', in order to catch
   e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   Return 1 on success, 0 on error (the normalized name does not fit into
   lower_len-1 characters).  On error `lower` is not NUL-terminated and must
   not be used. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_name[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* Without room for the terminator, lower_len - 1 below would wrap. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* The default name is subject to the same size limit as any other
           name; sizeof() counts the terminating NUL. */
        if (sizeof(default_name) > lower_len)
            return 0;
        strcpy(lower, default_name);
        return 1;
    }

    l = lower;
    l_end = &lower[lower_len - 1];      /* slot reserved for the NUL */
    for (e = encoding; *e; e++) {
        char c = *e;

        if (l == l_end)
            return 0;
        if (Py_ISUPPER(c))
            c = Py_TOLOWER(c);
        else if (c == '_')
            c = '-';
        *l++ = c;
    }
    *l = '\0';
    return 1;
}
2937
Alexander Belopolsky40018472011-02-26 01:02:56 +00002938PyObject *
2939PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002940 Py_ssize_t size,
2941 const char *encoding,
2942 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002943{
2944 PyObject *buffer = NULL, *unicode;
2945 Py_buffer info;
2946 char lower[11]; /* Enough for any encoding shortcut */
2947
Fred Drakee4315f52000-05-09 19:53:39 +00002948 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002949 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002950 if ((strcmp(lower, "utf-8") == 0) ||
2951 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002952 return PyUnicode_DecodeUTF8(s, size, errors);
2953 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002954 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002955 (strcmp(lower, "iso-8859-1") == 0))
2956 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002957#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002958 else if (strcmp(lower, "mbcs") == 0)
2959 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002960#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002961 else if (strcmp(lower, "ascii") == 0)
2962 return PyUnicode_DecodeASCII(s, size, errors);
2963 else if (strcmp(lower, "utf-16") == 0)
2964 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2965 else if (strcmp(lower, "utf-32") == 0)
2966 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968
2969 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002970 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002971 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002972 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002973 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002974 if (buffer == NULL)
2975 goto onError;
2976 unicode = PyCodec_Decode(buffer, encoding, errors);
2977 if (unicode == NULL)
2978 goto onError;
2979 if (!PyUnicode_Check(unicode)) {
2980 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002981 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002982 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983 Py_DECREF(unicode);
2984 goto onError;
2985 }
2986 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002987 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002988
Benjamin Peterson29060642009-01-31 22:14:21 +00002989 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990 Py_XDECREF(buffer);
2991 return NULL;
2992}
2993
Alexander Belopolsky40018472011-02-26 01:02:56 +00002994PyObject *
2995PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002996 const char *encoding,
2997 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002998{
2999 PyObject *v;
3000
3001 if (!PyUnicode_Check(unicode)) {
3002 PyErr_BadArgument();
3003 goto onError;
3004 }
3005
3006 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003007 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003008
3009 /* Decode via the codec registry */
3010 v = PyCodec_Decode(unicode, encoding, errors);
3011 if (v == NULL)
3012 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003013 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003014
Benjamin Peterson29060642009-01-31 22:14:21 +00003015 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003016 return NULL;
3017}
3018
Alexander Belopolsky40018472011-02-26 01:02:56 +00003019PyObject *
3020PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003021 const char *encoding,
3022 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003023{
3024 PyObject *v;
3025
3026 if (!PyUnicode_Check(unicode)) {
3027 PyErr_BadArgument();
3028 goto onError;
3029 }
3030
3031 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003032 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003033
3034 /* Decode via the codec registry */
3035 v = PyCodec_Decode(unicode, encoding, errors);
3036 if (v == NULL)
3037 goto onError;
3038 if (!PyUnicode_Check(v)) {
3039 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003040 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003041 Py_TYPE(v)->tp_name);
3042 Py_DECREF(v);
3043 goto onError;
3044 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003045 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003046
Benjamin Peterson29060642009-01-31 22:14:21 +00003047 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003048 return NULL;
3049}
3050
Alexander Belopolsky40018472011-02-26 01:02:56 +00003051PyObject *
3052PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003053 Py_ssize_t size,
3054 const char *encoding,
3055 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056{
3057 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003058
Guido van Rossumd57fd912000-03-10 22:53:23 +00003059 unicode = PyUnicode_FromUnicode(s, size);
3060 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003061 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003062 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3063 Py_DECREF(unicode);
3064 return v;
3065}
3066
Alexander Belopolsky40018472011-02-26 01:02:56 +00003067PyObject *
3068PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003069 const char *encoding,
3070 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003071{
3072 PyObject *v;
3073
3074 if (!PyUnicode_Check(unicode)) {
3075 PyErr_BadArgument();
3076 goto onError;
3077 }
3078
3079 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003080 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003081
3082 /* Encode via the codec registry */
3083 v = PyCodec_Encode(unicode, encoding, errors);
3084 if (v == NULL)
3085 goto onError;
3086 return v;
3087
Benjamin Peterson29060642009-01-31 22:14:21 +00003088 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003089 return NULL;
3090}
3091
Victor Stinnerad158722010-10-27 00:25:46 +00003092PyObject *
3093PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003094{
Victor Stinner99b95382011-07-04 14:23:54 +02003095#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003096 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003097#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003098 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003099#else
Victor Stinner793b5312011-04-27 00:24:21 +02003100 PyInterpreterState *interp = PyThreadState_GET()->interp;
3101 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3102 cannot use it to encode and decode filenames before it is loaded. Load
3103 the Python codec requires to encode at least its own filename. Use the C
3104 version of the locale codec until the codec registry is initialized and
3105 the Python codec is loaded.
3106
3107 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3108 cannot only rely on it: check also interp->fscodec_initialized for
3109 subinterpreters. */
3110 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003111 return PyUnicode_AsEncodedString(unicode,
3112 Py_FileSystemDefaultEncoding,
3113 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003114 }
3115 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003116 /* locale encoding with surrogateescape */
3117 wchar_t *wchar;
3118 char *bytes;
3119 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003120 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003121
3122 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3123 if (wchar == NULL)
3124 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003125 bytes = _Py_wchar2char(wchar, &error_pos);
3126 if (bytes == NULL) {
3127 if (error_pos != (size_t)-1) {
3128 char *errmsg = strerror(errno);
3129 PyObject *exc = NULL;
3130 if (errmsg == NULL)
3131 errmsg = "Py_wchar2char() failed";
3132 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003133 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003134 error_pos, error_pos+1,
3135 errmsg);
3136 Py_XDECREF(exc);
3137 }
3138 else
3139 PyErr_NoMemory();
3140 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003141 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003142 }
3143 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003144
3145 bytes_obj = PyBytes_FromString(bytes);
3146 PyMem_Free(bytes);
3147 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003148 }
Victor Stinnerad158722010-10-27 00:25:46 +00003149#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003150}
3151
Alexander Belopolsky40018472011-02-26 01:02:56 +00003152PyObject *
3153PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003154 const char *encoding,
3155 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003156{
3157 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003158 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003159
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 if (!PyUnicode_Check(unicode)) {
3161 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003162 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003163 }
Fred Drakee4315f52000-05-09 19:53:39 +00003164
Fred Drakee4315f52000-05-09 19:53:39 +00003165 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003166 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003167 if ((strcmp(lower, "utf-8") == 0) ||
3168 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003169 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003170 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003171 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003172 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003173 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003174 }
Victor Stinner37296e82010-06-10 13:36:23 +00003175 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003176 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003177 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003178 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003179#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003180 else if (strcmp(lower, "mbcs") == 0)
3181 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003182#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003183 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003184 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003185 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003186
3187 /* Encode via the codec registry */
3188 v = PyCodec_Encode(unicode, encoding, errors);
3189 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003190 return NULL;
3191
3192 /* The normal path */
3193 if (PyBytes_Check(v))
3194 return v;
3195
3196 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003197 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003198 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003199 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003200
3201 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3202 "encoder %s returned bytearray instead of bytes",
3203 encoding);
3204 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003205 Py_DECREF(v);
3206 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003207 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003208
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003209 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3210 Py_DECREF(v);
3211 return b;
3212 }
3213
3214 PyErr_Format(PyExc_TypeError,
3215 "encoder did not return a bytes object (type=%.400s)",
3216 Py_TYPE(v)->tp_name);
3217 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003218 return NULL;
3219}
3220
Alexander Belopolsky40018472011-02-26 01:02:56 +00003221PyObject *
3222PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003223 const char *encoding,
3224 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003225{
3226 PyObject *v;
3227
3228 if (!PyUnicode_Check(unicode)) {
3229 PyErr_BadArgument();
3230 goto onError;
3231 }
3232
3233 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003234 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003235
3236 /* Encode via the codec registry */
3237 v = PyCodec_Encode(unicode, encoding, errors);
3238 if (v == NULL)
3239 goto onError;
3240 if (!PyUnicode_Check(v)) {
3241 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003242 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003243 Py_TYPE(v)->tp_name);
3244 Py_DECREF(v);
3245 goto onError;
3246 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003248
Benjamin Peterson29060642009-01-31 22:14:21 +00003249 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003250 return NULL;
3251}
3252
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003253PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003254PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003255 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003256 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3257}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003258
Christian Heimes5894ba72007-11-04 11:43:14 +00003259PyObject*
3260PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3261{
Victor Stinner99b95382011-07-04 14:23:54 +02003262#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003263 return PyUnicode_DecodeMBCS(s, size, NULL);
3264#elif defined(__APPLE__)
3265 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3266#else
Victor Stinner793b5312011-04-27 00:24:21 +02003267 PyInterpreterState *interp = PyThreadState_GET()->interp;
3268 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3269 cannot use it to encode and decode filenames before it is loaded. Load
3270 the Python codec requires to encode at least its own filename. Use the C
3271 version of the locale codec until the codec registry is initialized and
3272 the Python codec is loaded.
3273
3274 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3275 cannot only rely on it: check also interp->fscodec_initialized for
3276 subinterpreters. */
3277 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003278 return PyUnicode_Decode(s, size,
3279 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003280 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003281 }
3282 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003283 /* locale encoding with surrogateescape */
3284 wchar_t *wchar;
3285 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003286 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003287
3288 if (s[size] != '\0' || size != strlen(s)) {
3289 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3290 return NULL;
3291 }
3292
Victor Stinner168e1172010-10-16 23:16:16 +00003293 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003294 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003295 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003296
Victor Stinner168e1172010-10-16 23:16:16 +00003297 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003298 PyMem_Free(wchar);
3299 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003300 }
Victor Stinnerad158722010-10-27 00:25:46 +00003301#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003302}
3303
Martin v. Löwis011e8422009-05-05 04:43:17 +00003304
3305int
3306PyUnicode_FSConverter(PyObject* arg, void* addr)
3307{
3308 PyObject *output = NULL;
3309 Py_ssize_t size;
3310 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003311 if (arg == NULL) {
3312 Py_DECREF(*(PyObject**)addr);
3313 return 1;
3314 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003315 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003316 output = arg;
3317 Py_INCREF(output);
3318 }
3319 else {
3320 arg = PyUnicode_FromObject(arg);
3321 if (!arg)
3322 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003323 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003324 Py_DECREF(arg);
3325 if (!output)
3326 return 0;
3327 if (!PyBytes_Check(output)) {
3328 Py_DECREF(output);
3329 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3330 return 0;
3331 }
3332 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003333 size = PyBytes_GET_SIZE(output);
3334 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003335 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003336 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003337 Py_DECREF(output);
3338 return 0;
3339 }
3340 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003341 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003342}
3343
3344
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003345int
3346PyUnicode_FSDecoder(PyObject* arg, void* addr)
3347{
3348 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003349 if (arg == NULL) {
3350 Py_DECREF(*(PyObject**)addr);
3351 return 1;
3352 }
3353 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003354 if (PyUnicode_READY(arg))
3355 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003356 output = arg;
3357 Py_INCREF(output);
3358 }
3359 else {
3360 arg = PyBytes_FromObject(arg);
3361 if (!arg)
3362 return 0;
3363 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3364 PyBytes_GET_SIZE(arg));
3365 Py_DECREF(arg);
3366 if (!output)
3367 return 0;
3368 if (!PyUnicode_Check(output)) {
3369 Py_DECREF(output);
3370 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3371 return 0;
3372 }
3373 }
Victor Stinner065836e2011-10-27 01:56:33 +02003374 if (PyUnicode_READY(output) < 0) {
3375 Py_DECREF(output);
3376 return 0;
3377 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003378 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003379 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003380 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3381 Py_DECREF(output);
3382 return 0;
3383 }
3384 *(PyObject**)addr = output;
3385 return Py_CLEANUP_SUPPORTED;
3386}
3387
3388
Martin v. Löwis5b222132007-06-10 09:51:05 +00003389char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003390PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003391{
Christian Heimesf3863112007-11-22 07:46:41 +00003392 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003393
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003394 if (!PyUnicode_Check(unicode)) {
3395 PyErr_BadArgument();
3396 return NULL;
3397 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003398 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003399 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003400
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003401 if (PyUnicode_UTF8(unicode) == NULL) {
3402 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003403 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3404 if (bytes == NULL)
3405 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003406 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3407 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003408 Py_DECREF(bytes);
3409 return NULL;
3410 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003411 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3412 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3413 PyBytes_AS_STRING(bytes),
3414 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003415 Py_DECREF(bytes);
3416 }
3417
3418 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003419 *psize = PyUnicode_UTF8_LENGTH(unicode);
3420 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003421}
3422
3423char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003424PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003425{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003426 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3427}
3428
3429#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003430static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003431#endif
3432
3433
3434Py_UNICODE *
3435PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3436{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003437 const unsigned char *one_byte;
3438#if SIZEOF_WCHAR_T == 4
3439 const Py_UCS2 *two_bytes;
3440#else
3441 const Py_UCS4 *four_bytes;
3442 const Py_UCS4 *ucs4_end;
3443 Py_ssize_t num_surrogates;
3444#endif
3445 wchar_t *w;
3446 wchar_t *wchar_end;
3447
3448 if (!PyUnicode_Check(unicode)) {
3449 PyErr_BadArgument();
3450 return NULL;
3451 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003452 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003453 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003454 assert(_PyUnicode_KIND(unicode) != 0);
3455 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003456
3457#ifdef Py_DEBUG
3458 ++unicode_as_unicode_calls;
3459#endif
3460
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003461 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003462#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003463 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3464 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003465 num_surrogates = 0;
3466
3467 for (; four_bytes < ucs4_end; ++four_bytes) {
3468 if (*four_bytes > 0xFFFF)
3469 ++num_surrogates;
3470 }
3471
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003472 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3473 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3474 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003475 PyErr_NoMemory();
3476 return NULL;
3477 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003478 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003479
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003480 w = _PyUnicode_WSTR(unicode);
3481 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3482 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003483 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3484 if (*four_bytes > 0xFFFF) {
3485 /* encode surrogate pair in this case */
3486 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3487 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3488 }
3489 else
3490 *w = *four_bytes;
3491
3492 if (w > wchar_end) {
3493 assert(0 && "Miscalculated string end");
3494 }
3495 }
3496 *w = 0;
3497#else
3498 /* sizeof(wchar_t) == 4 */
3499 Py_FatalError("Impossible unicode object state, wstr and str "
3500 "should share memory already.");
3501 return NULL;
3502#endif
3503 }
3504 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003505 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3506 (_PyUnicode_LENGTH(unicode) + 1));
3507 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003508 PyErr_NoMemory();
3509 return NULL;
3510 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003511 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3512 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3513 w = _PyUnicode_WSTR(unicode);
3514 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003515
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003516 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3517 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003518 for (; w < wchar_end; ++one_byte, ++w)
3519 *w = *one_byte;
3520 /* null-terminate the wstr */
3521 *w = 0;
3522 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003523 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003524#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003525 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003526 for (; w < wchar_end; ++two_bytes, ++w)
3527 *w = *two_bytes;
3528 /* null-terminate the wstr */
3529 *w = 0;
3530#else
3531 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003532 PyObject_FREE(_PyUnicode_WSTR(unicode));
3533 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003534 Py_FatalError("Impossible unicode object state, wstr "
3535 "and str should share memory already.");
3536 return NULL;
3537#endif
3538 }
3539 else {
3540 assert(0 && "This should never happen.");
3541 }
3542 }
3543 }
3544 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003545 *size = PyUnicode_WSTR_LENGTH(unicode);
3546 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003547}
3548
Alexander Belopolsky40018472011-02-26 01:02:56 +00003549Py_UNICODE *
3550PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003552 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553}
3554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003555
Alexander Belopolsky40018472011-02-26 01:02:56 +00003556Py_ssize_t
3557PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558{
3559 if (!PyUnicode_Check(unicode)) {
3560 PyErr_BadArgument();
3561 goto onError;
3562 }
3563 return PyUnicode_GET_SIZE(unicode);
3564
Benjamin Peterson29060642009-01-31 22:14:21 +00003565 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566 return -1;
3567}
3568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003569Py_ssize_t
3570PyUnicode_GetLength(PyObject *unicode)
3571{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003572 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003573 PyErr_BadArgument();
3574 return -1;
3575 }
3576
3577 return PyUnicode_GET_LENGTH(unicode);
3578}
3579
3580Py_UCS4
3581PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3582{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003583 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3584 PyErr_BadArgument();
3585 return (Py_UCS4)-1;
3586 }
3587 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3588 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003589 return (Py_UCS4)-1;
3590 }
3591 return PyUnicode_READ_CHAR(unicode, index);
3592}
3593
3594int
3595PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3596{
3597 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003598 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003599 return -1;
3600 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003601 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3602 PyErr_SetString(PyExc_IndexError, "string index out of range");
3603 return -1;
3604 }
3605 if (_PyUnicode_Dirty(unicode))
3606 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003607 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3608 index, ch);
3609 return 0;
3610}
3611
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *name = "utf-8";

    return name;
}
3617
Victor Stinner554f3f02010-06-16 23:33:54 +00003618/* create or adjust a UnicodeDecodeError */
3619static void
3620make_decode_exception(PyObject **exceptionObject,
3621 const char *encoding,
3622 const char *input, Py_ssize_t length,
3623 Py_ssize_t startpos, Py_ssize_t endpos,
3624 const char *reason)
3625{
3626 if (*exceptionObject == NULL) {
3627 *exceptionObject = PyUnicodeDecodeError_Create(
3628 encoding, input, length, startpos, endpos, reason);
3629 }
3630 else {
3631 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3632 goto onError;
3633 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3634 goto onError;
3635 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3636 goto onError;
3637 }
3638 return;
3639
3640onError:
3641 Py_DECREF(*exceptionObject);
3642 *exceptionObject = NULL;
3643}
3644
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003645/* error handling callback helper:
3646 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003647 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003648 and adjust various state variables.
3649 return 0 on success, -1 on error
3650*/
3651
Alexander Belopolsky40018472011-02-26 01:02:56 +00003652static int
3653unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003654 const char *encoding, const char *reason,
3655 const char **input, const char **inend, Py_ssize_t *startinpos,
3656 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003657 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003658{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003659 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660
3661 PyObject *restuple = NULL;
3662 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003663 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003664 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003665 Py_ssize_t requiredsize;
3666 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003667 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003668 int res = -1;
3669
Victor Stinner596a6c42011-11-09 00:02:18 +01003670 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3671 outsize = PyUnicode_GET_LENGTH(*output);
3672 else
3673 outsize = _PyUnicode_WSTR_LENGTH(*output);
3674
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003676 *errorHandler = PyCodec_LookupError(errors);
3677 if (*errorHandler == NULL)
3678 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003679 }
3680
Victor Stinner554f3f02010-06-16 23:33:54 +00003681 make_decode_exception(exceptionObject,
3682 encoding,
3683 *input, *inend - *input,
3684 *startinpos, *endinpos,
3685 reason);
3686 if (*exceptionObject == NULL)
3687 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003688
3689 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3690 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003691 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003692 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003693 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003694 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003695 }
3696 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003697 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003698 if (PyUnicode_READY(repunicode) < 0)
3699 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003700
3701 /* Copy back the bytes variables, which might have been modified by the
3702 callback */
3703 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3704 if (!inputobj)
3705 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003706 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003707 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003708 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003709 *input = PyBytes_AS_STRING(inputobj);
3710 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003711 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003712 /* we can DECREF safely, as the exception has another reference,
3713 so the object won't go away. */
3714 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003715
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003717 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003718 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003719 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3720 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003721 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722
Victor Stinner596a6c42011-11-09 00:02:18 +01003723 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3724 /* need more space? (at least enough for what we
3725 have+the replacement+the rest of the string (starting
3726 at the new input position), so we won't have to check space
3727 when there are no errors in the rest of the string) */
3728 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3729 requiredsize = *outpos + replen + insize-newpos;
3730 if (requiredsize > outsize) {
3731 if (requiredsize<2*outsize)
3732 requiredsize = 2*outsize;
3733 if (unicode_resize(output, requiredsize) < 0)
3734 goto onError;
3735 }
3736 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003737 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003738 copy_characters(*output, *outpos, repunicode, 0, replen);
3739 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003740 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003741 else {
3742 wchar_t *repwstr;
3743 Py_ssize_t repwlen;
3744 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3745 if (repwstr == NULL)
3746 goto onError;
3747 /* need more space? (at least enough for what we
3748 have+the replacement+the rest of the string (starting
3749 at the new input position), so we won't have to check space
3750 when there are no errors in the rest of the string) */
3751 requiredsize = *outpos + repwlen + insize-newpos;
3752 if (requiredsize > outsize) {
3753 if (requiredsize < 2*outsize)
3754 requiredsize = 2*outsize;
3755 if (unicode_resize(output, requiredsize) < 0)
3756 goto onError;
3757 }
3758 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3759 *outpos += repwlen;
3760 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003761 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003762 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003763
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 /* we made it! */
3765 res = 0;
3766
Benjamin Peterson29060642009-01-31 22:14:21 +00003767 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003768 Py_XDECREF(restuple);
3769 return res;
3770}
3771
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003772/* --- UTF-7 Codec -------------------------------------------------------- */
3773
Antoine Pitrou244651a2009-05-04 18:56:13 +00003774/* See RFC2152 for details. We encode conservatively and decode liberally. */
3775
3776/* Three simple macros defining base-64. */
3777
/* True when c belongs to the base-64 alphabet (letters, digits, '+', '/'). */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ('0' <= (c) && (c) <= '9') || \
     ('a' <= (c) && (c) <= 'z') || \
     ('A' <= (c) && (c) <= 'Z'))
3785
/* Map a base-64 character c to its 6-bit value (0..63).  The argument is
   expected to satisfy IS_BASE64(); any other value yields 63. */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     63)
3793
/* Base-64 character for the low 6 bits of n (higher bits are ignored). */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
     "abcdefghijklmnopqrstuvwxyz" \
     "0123456789+/"[(n) & 63])
3798
/* DECODE_DIRECT: true when a byte found outside a base-64 section of a
 * UTF-7 string stands for itself.  Decoding is deliberately permissive:
 * every ASCII byte qualifies except '+', which opens a base-64
 * section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
3806
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 to qualify; NUL and non-ASCII are never direct. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003852
Alexander Belopolsky40018472011-02-26 01:02:56 +00003853PyObject *
3854PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003855 Py_ssize_t size,
3856 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003857{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003858 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3859}
3860
Antoine Pitrou244651a2009-05-04 18:56:13 +00003861/* The decoder. The only state we preserve is our read position,
3862 * i.e. how many characters we have consumed. So if we end in the
3863 * middle of a shift sequence we have to back off the read position
3864 * and the output to the beginning of the sequence, otherwise we lose
3865 * all the shift state (seen bits, number of bits seen, high
3866 * surrogate). */
3867
/* Decode `size` bytes of UTF-7 starting at `s` into a string object.

   errors:   name of the error handler, forwarded to
             unicode_decode_call_errorhandler() on malformed input.
   consumed: if NULL, input that ends inside a shift sequence with pending
             bits or a pending surrogate is reported through the error
             handler.  If non-NULL, such a trailing shift sequence is not an
             error: the output is rolled back to where the sequence began
             and *consumed is set to the input offset of the '+' that
             opened it; otherwise *consumed is the number of bytes read.

   Returns the decoded string, or NULL on error (the partially built
   string, the cached error handler and the cached exception are
   released). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00003868PyObject *
3869PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003870 Py_ssize_t size,
3871 const char *errors,
3872 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003873{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003874 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003875 Py_ssize_t startinpos;
3876 Py_ssize_t endinpos;
3877 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003878 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003879 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003880 const char *errmsg = "";
/* Shift state: inShift is nonzero inside a '+...' base-64 section,
   shiftOutStart is the output position where that section began,
   base64buffer/base64bits accumulate not-yet-emitted bits, and surrogate
   holds a pending high surrogate (0 if none). */
3881 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003882 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003883 unsigned int base64bits = 0;
3884 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01003885 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003886 PyObject *errorHandler = NULL;
3887 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003888
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003889 /* Start off assuming it's all ASCII. Widen later as necessary. */
3890 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003891 if (!unicode)
3892 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003893 if (size == 0) {
3894 if (consumed)
3895 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003896 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003897 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003898
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003899 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003900 e = s + size;
3901
3902 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003903 Py_UCS4 ch;
/* Also entered by goto from the end-of-input error path below, when the
   error handler leaves input still to be decoded. */
Benjamin Peterson29060642009-01-31 22:14:21 +00003904 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003905 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003906
Antoine Pitrou244651a2009-05-04 18:56:13 +00003907 if (inShift) { /* in a base-64 section */
3908 if (IS_BASE64(ch)) { /* consume a base-64 character */
3909 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3910 base64bits += 6;
3911 s++;
3912 if (base64bits >= 16) {
3913 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01003914 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00003915 base64bits -= 16;
3916 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3917 if (surrogate) {
3918 /* expecting a second surrogate */
3919 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
/* Combine the pending high surrogate with this low surrogate into one
   code point above U+FFFF. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003920 Py_UCS4 ch2 = (((surrogate & 0x3FF)<<10)
3921 | (outCh & 0x3FF)) + 0x10000;
3922 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3923 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003924 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003925 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003926 }
3927 else {
/* Not a low surrogate: emit the pending high surrogate on its own and
   fall through to handle outCh normally. */
Antoine Pitrou78edf752011-11-15 01:44:16 +01003928 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3929 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003930 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003931 }
3932 }
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003933 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003934 /* first surrogate */
3935 surrogate = outCh;
3936 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003937 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003938 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3939 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003940 }
3941 }
3942 }
3943 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003944 inShift = 0;
3945 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003946 if (surrogate) {
/* A high surrogate without a partner is written out unchanged. */
Antoine Pitrou78edf752011-11-15 01:44:16 +01003947 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3948 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003949 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003950 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003951 if (base64bits > 0) { /* left-over bits */
3952 if (base64bits >= 6) {
3953 /* We've seen at least one base-64 character */
3954 errmsg = "partial character in shift sequence";
3955 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003956 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003957 else {
3958 /* Some bits remain; they should be zero */
3959 if (base64buffer != 0) {
3960 errmsg = "non-zero padding bits in shift sequence";
3961 goto utf7Error;
3962 }
3963 }
3964 }
3965 if (ch != '-') {
3966 /* '-' is absorbed; other terminating
3967 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003968 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3969 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003970 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003971 }
3972 }
3973 else if ( ch == '+' ) {
/* Remember where the '+' was: used as the error start position and as
   the *consumed value if the input ends inside this shift sequence. */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003974 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003975 s++; /* consume '+' */
3976 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003977 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003978 if (unicode_putchar(&unicode, &outpos, '+') < 0)
3979 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003980 }
3981 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003982 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003983 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003984 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003985 }
3986 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003987 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003988 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3989 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003990 s++;
3991 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003992 else {
3993 startinpos = s-starts;
3994 s++;
3995 errmsg = "unexpected special character";
3996 goto utf7Error;
3997 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003998 continue;
/* Shared error path for the loop body: the jump site has set errmsg (and
   startinpos); the handler may repoint starts/e/s and writes its
   replacement into unicode at outpos. */
Antoine Pitrou244651a2009-05-04 18:56:13 +00003999utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004000 endinpos = s-starts;
4001 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004002 errors, &errorHandler,
4003 "utf7", errmsg,
4004 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004005 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004006 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004007 }
4008
Antoine Pitrou244651a2009-05-04 18:56:13 +00004009 /* end of string */
4010
4011 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4012 /* if we're in an inconsistent state, that's an error */
4013 if (surrogate ||
4014 (base64bits >= 6) ||
4015 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004016 endinpos = size;
4017 if (unicode_decode_call_errorhandler(
4018 errors, &errorHandler,
4019 "utf7", "unterminated shift sequence",
4020 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004021 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004022 goto onError;
/* NOTE(review): inShift, base64bits and surrogate are not reset before
   re-entering the loop here; confirm that this is intended. */
4023 if (s < e)
4024 goto restart;
4025 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004026 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004027
4028 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004029 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004030 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004031 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004032 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004033 }
4034 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004035 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004036 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004037 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004038
/* The string was allocated with room for `size` characters; resize it to
   the outpos characters actually produced. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004039 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004040 goto onError;
4041
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004042 Py_XDECREF(errorHandler);
4043 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004044 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004045
Benjamin Peterson29060642009-01-31 22:14:21 +00004046 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047 Py_XDECREF(errorHandler);
4048 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004049 Py_DECREF(unicode);
4050 return NULL;
4051}
4052
4053
Alexander Belopolsky40018472011-02-26 01:02:56 +00004054PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004055_PyUnicode_EncodeUTF7(PyObject *str,
4056 int base64SetO,
4057 int base64WhiteSpace,
4058 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004059{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004060 int kind;
4061 void *data;
4062 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004063 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004064 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004065 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004066 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004067 unsigned int base64bits = 0;
4068 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004069 char * out;
4070 char * start;
4071
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004072 if (PyUnicode_READY(str) < 0)
4073 return NULL;
4074 kind = PyUnicode_KIND(str);
4075 data = PyUnicode_DATA(str);
4076 len = PyUnicode_GET_LENGTH(str);
4077
4078 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004079 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004080
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004081 /* It might be possible to tighten this worst case */
4082 allocated = 8 * len;
4083 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004084 return PyErr_NoMemory();
4085
Antoine Pitrou244651a2009-05-04 18:56:13 +00004086 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004087 if (v == NULL)
4088 return NULL;
4089
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004090 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004091 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004092 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004093
Antoine Pitrou244651a2009-05-04 18:56:13 +00004094 if (inShift) {
4095 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4096 /* shifting out */
4097 if (base64bits) { /* output remaining bits */
4098 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4099 base64buffer = 0;
4100 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004101 }
4102 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004103 /* Characters not in the BASE64 set implicitly unshift the sequence
4104 so no '-' is required, except if the character is itself a '-' */
4105 if (IS_BASE64(ch) || ch == '-') {
4106 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004107 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004108 *out++ = (char) ch;
4109 }
4110 else {
4111 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004112 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004113 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004114 else { /* not in a shift sequence */
4115 if (ch == '+') {
4116 *out++ = '+';
4117 *out++ = '-';
4118 }
4119 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4120 *out++ = (char) ch;
4121 }
4122 else {
4123 *out++ = '+';
4124 inShift = 1;
4125 goto encode_char;
4126 }
4127 }
4128 continue;
4129encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004130 if (ch >= 0x10000) {
4131 /* code first surrogate */
4132 base64bits += 16;
4133 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4134 while (base64bits >= 6) {
4135 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4136 base64bits -= 6;
4137 }
4138 /* prepare second surrogate */
4139 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4140 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004141 base64bits += 16;
4142 base64buffer = (base64buffer << 16) | ch;
4143 while (base64bits >= 6) {
4144 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4145 base64bits -= 6;
4146 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004147 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004148 if (base64bits)
4149 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4150 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004151 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004152 if (_PyBytes_Resize(&v, out - start) < 0)
4153 return NULL;
4154 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004155}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004156PyObject *
4157PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4158 Py_ssize_t size,
4159 int base64SetO,
4160 int base64WhiteSpace,
4161 const char *errors)
4162{
4163 PyObject *result;
4164 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4165 if (tmp == NULL)
4166 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004167 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004168 base64WhiteSpace, errors);
4169 Py_DECREF(tmp);
4170 return result;
4171}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004172
Antoine Pitrou244651a2009-05-04 18:56:13 +00004173#undef IS_BASE64
4174#undef FROM_BASE64
4175#undef TO_BASE64
4176#undef DECODE_DIRECT
4177#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004178
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179/* --- UTF-8 Codec -------------------------------------------------------- */
4180
/* Total sequence length implied by each possible UTF-8 lead byte.
   A zero entry marks a byte that is not a legal prefix.  See RFC 3629
   for details. */
static
char utf8_code_length[256] = {
    /* 00-0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 10-1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 20-2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 30-3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 40-4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 50-5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 60-6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 70-7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90-9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0-AF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0-BF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1 are illegal, C2-CF start 2-byte sequences */
    /* C0-CF */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0-DF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4 start 4-byte sequences, F5-FF are illegal */
    /* F0-FF */ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
4202
Alexander Belopolsky40018472011-02-26 01:02:56 +00004203PyObject *
4204PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004205 Py_ssize_t size,
4206 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004207{
Walter Dörwald69652032004-09-07 20:24:22 +00004208 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4209}
4210
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004211#include "stringlib/ucs1lib.h"
4212#include "stringlib/codecs.h"
4213#include "stringlib/undef.h"
4214
4215#include "stringlib/ucs2lib.h"
4216#include "stringlib/codecs.h"
4217#include "stringlib/undef.h"
4218
4219#include "stringlib/ucs4lib.h"
4220#include "stringlib/codecs.h"
4221#include "stringlib/undef.h"
4222
Antoine Pitrouab868312009-01-10 15:40:25 +00004223/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4224#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4225
4226/* Mask to quickly check whether a C 'long' contains a
4227 non-ASCII, UTF8-encoded char. */
4228#if (SIZEOF_LONG == 8)
4229# define ASCII_CHAR_MASK 0x8080808080808080L
4230#elif (SIZEOF_LONG == 4)
4231# define ASCII_CHAR_MASK 0x80808080L
4232#else
4233# error C 'long' size should be either 4 or 8!
4234#endif
4235
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004236/* Scans a UTF-8 string and returns the maximum character to be expected
4237 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004239 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004240 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004241 */
4242static Py_UCS4
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004243utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
4244 Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004245{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004246 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004247 const unsigned char *p = (const unsigned char *)s;
4248 const unsigned char *end = p + string_size;
4249 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004250
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004251 assert(unicode_size != NULL);
4252
4253 /* By having a cascade of independent loops which fallback onto each
4254 other, we minimize the amount of work done in the average loop
4255 iteration, and we also maximize the CPU's ability to predict
4256 branches correctly (because a given condition will have always the
4257 same boolean outcome except perhaps in the last iteration of the
4258 corresponding loop).
4259 In the general case this brings us rather close to decoding
4260 performance pre-PEP 393, despite the two-pass decoding.
4261
4262 Note that the pure ASCII loop is not duplicated once a non-ASCII
4263 character has been encountered. It is actually a pessimization (by
4264 a significant factor) to use this loop on text with many non-ASCII
4265 characters, and it is important to avoid bad performance on valid
4266 utf-8 data (invalid utf-8 being a different can of worms).
4267 */
4268
4269 /* ASCII */
4270 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004271 /* Only check value if it's not a ASCII char... */
4272 if (*p < 0x80) {
4273 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4274 an explanation. */
4275 if (!((size_t) p & LONG_PTR_MASK)) {
4276 /* Help register allocation */
4277 register const unsigned char *_p = p;
4278 while (_p < aligned_end) {
4279 unsigned long value = *(unsigned long *) _p;
4280 if (value & ASCII_CHAR_MASK)
4281 break;
4282 _p += SIZEOF_LONG;
4283 char_count += SIZEOF_LONG;
4284 }
4285 p = _p;
4286 if (p == end)
4287 break;
4288 }
4289 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004290 if (*p < 0x80)
4291 ++char_count;
4292 else
4293 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004294 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004295 *unicode_size = char_count;
4296 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004297
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004298_ucs1loop:
4299 for (; p < end; ++p) {
4300 if (*p < 0xc4)
4301 char_count += ((*p & 0xc0) != 0x80);
4302 else
4303 goto _ucs2loop;
4304 }
4305 *unicode_size = char_count;
4306 return 255;
4307
4308_ucs2loop:
4309 for (; p < end; ++p) {
4310 if (*p < 0xf0)
4311 char_count += ((*p & 0xc0) != 0x80);
4312 else
4313 goto _ucs4loop;
4314 }
4315 *unicode_size = char_count;
4316 return 65535;
4317
4318_ucs4loop:
4319 for (; p < end; ++p) {
4320 char_count += ((*p & 0xc0) != 0x80);
4321 }
4322 *unicode_size = char_count;
4323 return 65537;
4324}
4325
4326/* Called when we encountered some error that wasn't detected in the original
4327 scan, e.g. an encoded surrogate character. The original maxchar computation
4328 may have been incorrect, so redo it. */
4329static int
4330refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
4331{
4332 PyObject *tmp;
4333 Py_ssize_t k, maxchar;
4334 for (k = 0, maxchar = 0; k < n; k++)
4335 maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
4336 tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
4337 if (tmp == NULL)
4338 return -1;
4339 PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
4340 Py_DECREF(*unicode);
4341 *unicode = tmp;
4342 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004343}
4344
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors. Implicit parameters: unicode, kind, data, has_errors,
   onError. Potential resizing overallocates, so the result needs to shrink
   at the end.

   When has_errors is set, the character is stored with unicode_putchar() on
   `unicode`, after growing the string to index + index/8 characters if index
   is past its current length.  Otherwise it is stored directly into
   (kind, data) with PyUnicode_WRITE.  Any failure jumps to onError.

   NOTE(review): the growth check uses `>`, so a write at index == current
   length is not preceded by a resize -- confirm this is intended.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        if (has_errors) {                                               \
            /* Distinctive name so an `index` expression cannot be     \
               captured by this local. */                               \
            Py_ssize_t wmf_pos_ = (index);                              \
            if (wmf_pos_ > PyUnicode_GET_LENGTH(unicode) &&             \
                unicode_resize(&unicode, wmf_pos_ + wmf_pos_/8) < 0)    \
                goto onError;                                           \
            if (unicode_putchar(&unicode, &wmf_pos_, (value)) < 0)      \
                goto onError;                                           \
        }                                                               \
        else {                                                          \
            PyUnicode_WRITE(kind, data, (index), (value));              \
        }                                                               \
    } while (0)
4363
Alexander Belopolsky40018472011-02-26 01:02:56 +00004364PyObject *
4365PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004366 Py_ssize_t size,
4367 const char *errors,
4368 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004369{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004370 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004371 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004372 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004373 Py_ssize_t startinpos;
4374 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004375 const char *e, *aligned_end;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004376 PyObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004377 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378 PyObject *errorHandler = NULL;
4379 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004380 Py_UCS4 maxchar = 0;
4381 Py_ssize_t unicode_size;
4382 Py_ssize_t i;
4383 int kind;
4384 void *data;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004385 int has_errors = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386
Walter Dörwald69652032004-09-07 20:24:22 +00004387 if (size == 0) {
4388 if (consumed)
4389 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004390 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004391 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004392 maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004393 /* When the string is ASCII only, just use memcpy and return.
4394 unicode_size may be != size if there is an incomplete UTF-8
4395 sequence at the end of the ASCII block. */
4396 if (maxchar < 128 && size == unicode_size) {
Victor Stinner42885202011-11-22 01:23:02 +01004397 if (consumed)
4398 *consumed = size;
4399
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004400 if (size == 1)
4401 return get_latin1_char((unsigned char)s[0]);
4402
4403 unicode = PyUnicode_New(unicode_size, maxchar);
4404 if (!unicode)
4405 return NULL;
4406 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4407 assert(_PyUnicode_CheckConsistency(unicode, 1));
4408 return unicode;
4409 }
4410
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004411 /* In case of errors, maxchar and size computation might be incorrect;
4412 code below refits and resizes as necessary. */
4413 unicode = PyUnicode_New(unicode_size, maxchar);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004414 if (!unicode)
4415 return NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004416 kind = PyUnicode_KIND(unicode);
4417 data = PyUnicode_DATA(unicode);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004418
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004420 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421 e = s + size;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004422 switch (kind) {
4423 case PyUnicode_1BYTE_KIND:
4424 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4425 break;
4426 case PyUnicode_2BYTE_KIND:
4427 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4428 break;
4429 case PyUnicode_4BYTE_KIND:
4430 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4431 break;
4432 }
4433 if (!has_errors) {
4434 /* Ensure the unicode size calculation was correct */
4435 assert(i == unicode_size);
4436 assert(s == e);
4437 if (consumed)
4438 *consumed = s-starts;
4439 return unicode;
4440 }
4441 /* Fall through to the generic decoding loop for the rest of
4442 the string */
4443 if (refit_partial_string(&unicode, kind, data, i) < 0)
4444 goto onError;
4445
Antoine Pitrouab868312009-01-10 15:40:25 +00004446 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447
4448 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004449 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450
4451 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004452 /* Fast path for runs of ASCII characters. Given that common UTF-8
4453 input will consist of an overwhelming majority of ASCII
4454 characters, we try to optimize for this case by checking
4455 as many characters as a C 'long' can contain.
4456 First, check if we can do an aligned read, as most CPUs have
4457 a penalty for unaligned reads.
4458 */
4459 if (!((size_t) s & LONG_PTR_MASK)) {
4460 /* Help register allocation */
4461 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004462 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004463 while (_s < aligned_end) {
4464 /* Read a whole long at a time (either 4 or 8 bytes),
4465 and do a fast unrolled copy if it only contains ASCII
4466 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004467 unsigned long value = *(unsigned long *) _s;
4468 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004469 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004470 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4471 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4472 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4473 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004474#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004475 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4476 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4477 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4478 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004479#endif
4480 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004481 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004482 }
4483 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004484 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004485 if (s == e)
4486 break;
4487 ch = (unsigned char)*s;
4488 }
4489 }
4490
4491 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004492 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004493 s++;
4494 continue;
4495 }
4496
4497 n = utf8_code_length[ch];
4498
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004499 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004500 if (consumed)
4501 break;
4502 else {
4503 errmsg = "unexpected end of data";
4504 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004505 endinpos = startinpos+1;
4506 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4507 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004508 goto utf8Error;
4509 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004510 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004511
4512 switch (n) {
4513
4514 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004515 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004516 startinpos = s-starts;
4517 endinpos = startinpos+1;
4518 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519
4520 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004521 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004522 startinpos = s-starts;
4523 endinpos = startinpos+1;
4524 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525
4526 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004527 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004528 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004530 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004531 goto utf8Error;
4532 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004533 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004534 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004535 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536 break;
4537
4538 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004539 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4540 will result in surrogates in range d800-dfff. Surrogates are
4541 not valid UTF-8 so they are rejected.
4542 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4543 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004544 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004545 (s[2] & 0xc0) != 0x80 ||
4546 ((unsigned char)s[0] == 0xE0 &&
4547 (unsigned char)s[1] < 0xA0) ||
4548 ((unsigned char)s[0] == 0xED &&
4549 (unsigned char)s[1] > 0x9F)) {
4550 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004551 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004552 endinpos = startinpos + 1;
4553
4554 /* if s[1] first two bits are 1 and 0, then the invalid
4555 continuation byte is s[2], so increment endinpos by 1,
4556 if not, s[1] is invalid and endinpos doesn't need to
4557 be incremented. */
4558 if ((s[1] & 0xC0) == 0x80)
4559 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004560 goto utf8Error;
4561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004562 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004563 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004564 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004565 break;
4566
4567 case 4:
4568 if ((s[1] & 0xc0) != 0x80 ||
4569 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004570 (s[3] & 0xc0) != 0x80 ||
4571 ((unsigned char)s[0] == 0xF0 &&
4572 (unsigned char)s[1] < 0x90) ||
4573 ((unsigned char)s[0] == 0xF4 &&
4574 (unsigned char)s[1] > 0x8F)) {
4575 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004576 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004577 endinpos = startinpos + 1;
4578 if ((s[1] & 0xC0) == 0x80) {
4579 endinpos++;
4580 if ((s[2] & 0xC0) == 0x80)
4581 endinpos++;
4582 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004583 goto utf8Error;
4584 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004585 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004586 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4587 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4588
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004589 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004590 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004591 }
4592 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004594
Benjamin Peterson29060642009-01-31 22:14:21 +00004595 utf8Error:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004596 if (!has_errors) {
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004597 if (refit_partial_string(&unicode, kind, data, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004598 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004599 has_errors = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004600 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004601 if (unicode_decode_call_errorhandler(
4602 errors, &errorHandler,
4603 "utf8", errmsg,
4604 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004605 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004606 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004607 /* Update data because unicode_decode_call_errorhandler might have
4608 re-created or resized the unicode object. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004609 data = PyUnicode_DATA(unicode);
4610 kind = PyUnicode_KIND(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004611 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004613 /* Ensure the unicode_size calculation above was correct: */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004614 assert(has_errors || i == unicode_size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004615
Walter Dörwald69652032004-09-07 20:24:22 +00004616 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004617 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004619 /* Adjust length and ready string when it contained errors and
4620 is of the old resizable kind. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004621 if (has_errors) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01004622 if (PyUnicode_Resize(&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004623 goto onError;
4624 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004625
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004626 Py_XDECREF(errorHandler);
4627 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004628 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004629 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630
Benjamin Peterson29060642009-01-31 22:14:21 +00004631 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004632 Py_XDECREF(errorHandler);
4633 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004634 Py_DECREF(unicode);
4635 return NULL;
4636}
4637
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004638#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004639
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004640#ifdef __APPLE__
4641
4642/* Simplified UTF-8 decoder using surrogateescape error handler,
4643 used to decode the command line arguments on Mac OS X. */
4644
4645wchar_t*
4646_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4647{
4648 int n;
4649 const char *e;
4650 wchar_t *unicode, *p;
4651
4652 /* Note: size will always be longer than the resulting Unicode
4653 character count */
4654 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4655 PyErr_NoMemory();
4656 return NULL;
4657 }
4658 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4659 if (!unicode)
4660 return NULL;
4661
4662 /* Unpack UTF-8 encoded data */
4663 p = unicode;
4664 e = s + size;
4665 while (s < e) {
4666 Py_UCS4 ch = (unsigned char)*s;
4667
4668 if (ch < 0x80) {
4669 *p++ = (wchar_t)ch;
4670 s++;
4671 continue;
4672 }
4673
4674 n = utf8_code_length[ch];
4675 if (s + n > e) {
4676 goto surrogateescape;
4677 }
4678
4679 switch (n) {
4680 case 0:
4681 case 1:
4682 goto surrogateescape;
4683
4684 case 2:
4685 if ((s[1] & 0xc0) != 0x80)
4686 goto surrogateescape;
4687 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4688 assert ((ch > 0x007F) && (ch <= 0x07FF));
4689 *p++ = (wchar_t)ch;
4690 break;
4691
4692 case 3:
4693 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4694 will result in surrogates in range d800-dfff. Surrogates are
4695 not valid UTF-8 so they are rejected.
4696 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4697 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4698 if ((s[1] & 0xc0) != 0x80 ||
4699 (s[2] & 0xc0) != 0x80 ||
4700 ((unsigned char)s[0] == 0xE0 &&
4701 (unsigned char)s[1] < 0xA0) ||
4702 ((unsigned char)s[0] == 0xED &&
4703 (unsigned char)s[1] > 0x9F)) {
4704
4705 goto surrogateescape;
4706 }
4707 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4708 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004709 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004710 break;
4711
4712 case 4:
4713 if ((s[1] & 0xc0) != 0x80 ||
4714 (s[2] & 0xc0) != 0x80 ||
4715 (s[3] & 0xc0) != 0x80 ||
4716 ((unsigned char)s[0] == 0xF0 &&
4717 (unsigned char)s[1] < 0x90) ||
4718 ((unsigned char)s[0] == 0xF4 &&
4719 (unsigned char)s[1] > 0x8F)) {
4720 goto surrogateescape;
4721 }
4722 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4723 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4724 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4725
4726#if SIZEOF_WCHAR_T == 4
4727 *p++ = (wchar_t)ch;
4728#else
4729 /* compute and append the two surrogates: */
4730
4731 /* translate from 10000..10FFFF to 0..FFFF */
4732 ch -= 0x10000;
4733
4734 /* high surrogate = top 10 bits added to D800 */
4735 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4736
4737 /* low surrogate = bottom 10 bits added to DC00 */
4738 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4739#endif
4740 break;
4741 }
4742 s += n;
4743 continue;
4744
4745 surrogateescape:
4746 *p++ = 0xDC00 + ch;
4747 s++;
4748 }
4749 *p = L'\0';
4750 return unicode;
4751}
4752
4753#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004755/* Primary internal function which creates utf8 encoded bytes objects.
4756
4757 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004758 and allocate exactly as much space needed at the end. Else allocate the
4759 maximum possible needed (4 result bytes per Unicode character), and return
4760 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004761*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004762PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004763_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764{
Tim Peters602f7402002-04-27 18:03:26 +00004765#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004766
Guido van Rossum98297ee2007-11-06 21:34:58 +00004767 Py_ssize_t i; /* index into s of next input byte */
4768 PyObject *result; /* result string object */
4769 char *p; /* next free byte in output buffer */
4770 Py_ssize_t nallocated; /* number of result bytes allocated */
4771 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004772 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004773 PyObject *errorHandler = NULL;
4774 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004775 int kind;
4776 void *data;
4777 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004778 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004780 if (!PyUnicode_Check(unicode)) {
4781 PyErr_BadArgument();
4782 return NULL;
4783 }
4784
4785 if (PyUnicode_READY(unicode) == -1)
4786 return NULL;
4787
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004788 if (PyUnicode_UTF8(unicode))
4789 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4790 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004791
4792 kind = PyUnicode_KIND(unicode);
4793 data = PyUnicode_DATA(unicode);
4794 size = PyUnicode_GET_LENGTH(unicode);
4795
Tim Peters602f7402002-04-27 18:03:26 +00004796 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797
Tim Peters602f7402002-04-27 18:03:26 +00004798 if (size <= MAX_SHORT_UNICHARS) {
4799 /* Write into the stack buffer; nallocated can't overflow.
4800 * At the end, we'll allocate exactly as much heap space as it
4801 * turns out we need.
4802 */
4803 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004804 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004805 p = stackbuf;
4806 }
4807 else {
4808 /* Overallocate on the heap, and give the excess back at the end. */
4809 nallocated = size * 4;
4810 if (nallocated / 4 != size) /* overflow! */
4811 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004812 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004813 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004814 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004815 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004816 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004817
Tim Peters602f7402002-04-27 18:03:26 +00004818 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004819 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004820
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004821 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004822 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004824
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004826 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004827 *p++ = (char)(0xc0 | (ch >> 6));
4828 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004829 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004830 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004831 Py_ssize_t repsize, k, startpos;
4832 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004833 rep = unicode_encode_call_errorhandler(
4834 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004835 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004836 if (!rep)
4837 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004839 if (PyBytes_Check(rep))
4840 repsize = PyBytes_GET_SIZE(rep);
4841 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004842 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004843
4844 if (repsize > 4) {
4845 Py_ssize_t offset;
4846
4847 if (result == NULL)
4848 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004849 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004850 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004852 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4853 /* integer overflow */
4854 PyErr_NoMemory();
4855 goto error;
4856 }
4857 nallocated += repsize - 4;
4858 if (result != NULL) {
4859 if (_PyBytes_Resize(&result, nallocated) < 0)
4860 goto error;
4861 } else {
4862 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004863 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004864 goto error;
4865 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4866 }
4867 p = PyBytes_AS_STRING(result) + offset;
4868 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004869
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004870 if (PyBytes_Check(rep)) {
4871 char *prep = PyBytes_AS_STRING(rep);
4872 for(k = repsize; k > 0; k--)
4873 *p++ = *prep++;
4874 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004875 enum PyUnicode_Kind repkind;
4876 void *repdata;
4877
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004878 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004879 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004880 repkind = PyUnicode_KIND(rep);
4881 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004882
4883 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004884 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004885 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004886 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004887 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004888 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004889 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004890 goto error;
4891 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004892 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004893 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004894 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004895 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004896 } else if (ch < 0x10000) {
4897 *p++ = (char)(0xe0 | (ch >> 12));
4898 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4899 *p++ = (char)(0x80 | (ch & 0x3f));
4900 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004901 /* Encode UCS4 Unicode ordinals */
4902 *p++ = (char)(0xf0 | (ch >> 18));
4903 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4904 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4905 *p++ = (char)(0x80 | (ch & 0x3f));
4906 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004908
Guido van Rossum98297ee2007-11-06 21:34:58 +00004909 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004910 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004911 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004912 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004913 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004914 }
4915 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004916 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004917 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004918 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004919 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004920 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004921
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004922 Py_XDECREF(errorHandler);
4923 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004924 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004925 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004926 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004927 Py_XDECREF(errorHandler);
4928 Py_XDECREF(exc);
4929 Py_XDECREF(result);
4930 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004931
Tim Peters602f7402002-04-27 18:03:26 +00004932#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933}
4934
Alexander Belopolsky40018472011-02-26 01:02:56 +00004935PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004936PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4937 Py_ssize_t size,
4938 const char *errors)
4939{
4940 PyObject *v, *unicode;
4941
4942 unicode = PyUnicode_FromUnicode(s, size);
4943 if (unicode == NULL)
4944 return NULL;
4945 v = _PyUnicode_AsUTF8String(unicode, errors);
4946 Py_DECREF(unicode);
4947 return v;
4948}
4949
4950PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004951PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004953 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004954}
4955
Walter Dörwald41980ca2007-08-16 21:55:45 +00004956/* --- UTF-32 Codec ------------------------------------------------------- */
4957
4958PyObject *
4959PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004960 Py_ssize_t size,
4961 const char *errors,
4962 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004963{
4964 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4965}
4966
4967PyObject *
4968PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004969 Py_ssize_t size,
4970 const char *errors,
4971 int *byteorder,
4972 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004973{
4974 const char *starts = s;
4975 Py_ssize_t startinpos;
4976 Py_ssize_t endinpos;
4977 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004978 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004979 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004980 int bo = 0; /* assume native ordering by default */
4981 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004982 /* Offsets from q for retrieving bytes in the right order. */
4983#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4984 int iorder[] = {0, 1, 2, 3};
4985#else
4986 int iorder[] = {3, 2, 1, 0};
4987#endif
4988 PyObject *errorHandler = NULL;
4989 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004990
Walter Dörwald41980ca2007-08-16 21:55:45 +00004991 q = (unsigned char *)s;
4992 e = q + size;
4993
4994 if (byteorder)
4995 bo = *byteorder;
4996
4997 /* Check for BOM marks (U+FEFF) in the input and adjust current
4998 byte order setting accordingly. In native mode, the leading BOM
4999 mark is skipped, in all other modes, it is copied to the output
5000 stream as-is (giving a ZWNBSP character). */
5001 if (bo == 0) {
5002 if (size >= 4) {
5003 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005004 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005005#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005006 if (bom == 0x0000FEFF) {
5007 q += 4;
5008 bo = -1;
5009 }
5010 else if (bom == 0xFFFE0000) {
5011 q += 4;
5012 bo = 1;
5013 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005014#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005015 if (bom == 0x0000FEFF) {
5016 q += 4;
5017 bo = 1;
5018 }
5019 else if (bom == 0xFFFE0000) {
5020 q += 4;
5021 bo = -1;
5022 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005023#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005024 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005025 }
5026
5027 if (bo == -1) {
5028 /* force LE */
5029 iorder[0] = 0;
5030 iorder[1] = 1;
5031 iorder[2] = 2;
5032 iorder[3] = 3;
5033 }
5034 else if (bo == 1) {
5035 /* force BE */
5036 iorder[0] = 3;
5037 iorder[1] = 2;
5038 iorder[2] = 1;
5039 iorder[3] = 0;
5040 }
5041
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005042 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005043 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005044 if (!unicode)
5045 return NULL;
5046 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005047 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005048 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005049
Walter Dörwald41980ca2007-08-16 21:55:45 +00005050 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005051 Py_UCS4 ch;
5052 /* remaining bytes at the end? (size should be divisible by 4) */
5053 if (e-q<4) {
5054 if (consumed)
5055 break;
5056 errmsg = "truncated data";
5057 startinpos = ((const char *)q)-starts;
5058 endinpos = ((const char *)e)-starts;
5059 goto utf32Error;
5060 /* The remaining input chars are ignored if the callback
5061 chooses to skip the input */
5062 }
5063 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5064 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005065
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 if (ch >= 0x110000)
5067 {
5068 errmsg = "codepoint not in range(0x110000)";
5069 startinpos = ((const char *)q)-starts;
5070 endinpos = startinpos+4;
5071 goto utf32Error;
5072 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005073 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5074 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005075 q += 4;
5076 continue;
5077 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005078 if (unicode_decode_call_errorhandler(
5079 errors, &errorHandler,
5080 "utf32", errmsg,
5081 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005082 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005083 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005084 }
5085
5086 if (byteorder)
5087 *byteorder = bo;
5088
5089 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005090 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005091
5092 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005093 if (PyUnicode_Resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005094 goto onError;
5095
5096 Py_XDECREF(errorHandler);
5097 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005098 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005099
Benjamin Peterson29060642009-01-31 22:14:21 +00005100 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005101 Py_DECREF(unicode);
5102 Py_XDECREF(errorHandler);
5103 Py_XDECREF(exc);
5104 return NULL;
5105}
5106
5107PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005108_PyUnicode_EncodeUTF32(PyObject *str,
5109 const char *errors,
5110 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005111{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005112 int kind;
5113 void *data;
5114 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005115 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005116 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005117 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005118 /* Offsets from p for storing byte pairs in the right order. */
5119#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5120 int iorder[] = {0, 1, 2, 3};
5121#else
5122 int iorder[] = {3, 2, 1, 0};
5123#endif
5124
Benjamin Peterson29060642009-01-31 22:14:21 +00005125#define STORECHAR(CH) \
5126 do { \
5127 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5128 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5129 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5130 p[iorder[0]] = (CH) & 0xff; \
5131 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005132 } while(0)
5133
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005134 if (!PyUnicode_Check(str)) {
5135 PyErr_BadArgument();
5136 return NULL;
5137 }
5138 if (PyUnicode_READY(str) < 0)
5139 return NULL;
5140 kind = PyUnicode_KIND(str);
5141 data = PyUnicode_DATA(str);
5142 len = PyUnicode_GET_LENGTH(str);
5143
5144 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005145 bytesize = nsize * 4;
5146 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005147 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005148 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005149 if (v == NULL)
5150 return NULL;
5151
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005152 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005153 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005154 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005155 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005156 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005157
5158 if (byteorder == -1) {
5159 /* force LE */
5160 iorder[0] = 0;
5161 iorder[1] = 1;
5162 iorder[2] = 2;
5163 iorder[3] = 3;
5164 }
5165 else if (byteorder == 1) {
5166 /* force BE */
5167 iorder[0] = 3;
5168 iorder[1] = 2;
5169 iorder[2] = 1;
5170 iorder[3] = 0;
5171 }
5172
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005173 for (i = 0; i < len; i++)
5174 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005175
5176 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005177 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005178#undef STORECHAR
5179}
5180
Alexander Belopolsky40018472011-02-26 01:02:56 +00005181PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005182PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5183 Py_ssize_t size,
5184 const char *errors,
5185 int byteorder)
5186{
5187 PyObject *result;
5188 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5189 if (tmp == NULL)
5190 return NULL;
5191 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5192 Py_DECREF(tmp);
5193 return result;
5194}
5195
5196PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005197PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005198{
Victor Stinnerb960b342011-11-20 19:12:52 +01005199 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005200}
5201
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202/* --- UTF-16 Codec ------------------------------------------------------- */
5203
Tim Peters772747b2001-08-09 22:21:55 +00005204PyObject *
5205PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005206 Py_ssize_t size,
5207 const char *errors,
5208 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209{
Walter Dörwald69652032004-09-07 20:24:22 +00005210 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5211}
5212
Antoine Pitrouab868312009-01-10 15:40:25 +00005213/* Two masks for fast checking of whether a C 'long' may contain
5214 UTF16-encoded surrogate characters. This is an efficient heuristic,
5215 assuming that non-surrogate characters with a code point >= 0x8000 are
5216 rare in most input.
5217 FAST_CHAR_MASK is used when the input is in native byte ordering,
5218 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005219*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005220#if (SIZEOF_LONG == 8)
5221# define FAST_CHAR_MASK 0x8000800080008000L
5222# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5223#elif (SIZEOF_LONG == 4)
5224# define FAST_CHAR_MASK 0x80008000L
5225# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5226#else
5227# error C 'long' size should be either 4 or 8!
5228#endif
5229
Walter Dörwald69652032004-09-07 20:24:22 +00005230PyObject *
5231PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 Py_ssize_t size,
5233 const char *errors,
5234 int *byteorder,
5235 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005236{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005237 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005238 Py_ssize_t startinpos;
5239 Py_ssize_t endinpos;
5240 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005241 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005242 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005243 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005244 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005245 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005246 /* Offsets from q for retrieving byte pairs in the right order. */
5247#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5248 int ihi = 1, ilo = 0;
5249#else
5250 int ihi = 0, ilo = 1;
5251#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005252 PyObject *errorHandler = NULL;
5253 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254
5255 /* Note: size will always be longer than the resulting Unicode
5256 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005257 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258 if (!unicode)
5259 return NULL;
5260 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005261 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005262 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263
Tim Peters772747b2001-08-09 22:21:55 +00005264 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005265 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266
5267 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005268 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005270 /* Check for BOM marks (U+FEFF) in the input and adjust current
5271 byte order setting accordingly. In native mode, the leading BOM
5272 mark is skipped, in all other modes, it is copied to the output
5273 stream as-is (giving a ZWNBSP character). */
5274 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005275 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005276 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005277#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005278 if (bom == 0xFEFF) {
5279 q += 2;
5280 bo = -1;
5281 }
5282 else if (bom == 0xFFFE) {
5283 q += 2;
5284 bo = 1;
5285 }
Tim Petersced69f82003-09-16 20:30:58 +00005286#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005287 if (bom == 0xFEFF) {
5288 q += 2;
5289 bo = 1;
5290 }
5291 else if (bom == 0xFFFE) {
5292 q += 2;
5293 bo = -1;
5294 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005295#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005297 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298
Tim Peters772747b2001-08-09 22:21:55 +00005299 if (bo == -1) {
5300 /* force LE */
5301 ihi = 1;
5302 ilo = 0;
5303 }
5304 else if (bo == 1) {
5305 /* force BE */
5306 ihi = 0;
5307 ilo = 1;
5308 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005309#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5310 native_ordering = ilo < ihi;
5311#else
5312 native_ordering = ilo > ihi;
5313#endif
Tim Peters772747b2001-08-09 22:21:55 +00005314
Antoine Pitrouab868312009-01-10 15:40:25 +00005315 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005316 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005317 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005318 /* First check for possible aligned read of a C 'long'. Unaligned
5319 reads are more expensive, better to defer to another iteration. */
5320 if (!((size_t) q & LONG_PTR_MASK)) {
5321 /* Fast path for runs of non-surrogate chars. */
5322 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005323 int kind = PyUnicode_KIND(unicode);
5324 void *data = PyUnicode_DATA(unicode);
5325 while (_q < aligned_end) {
5326 unsigned long block = * (unsigned long *) _q;
5327 unsigned short *pblock = (unsigned short*)&block;
5328 Py_UCS4 maxch;
5329 if (native_ordering) {
5330 /* Can use buffer directly */
5331 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005332 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005333 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005334 else {
5335 /* Need to byte-swap */
5336 unsigned char *_p = (unsigned char*)pblock;
5337 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005338 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005339 _p[0] = _q[1];
5340 _p[1] = _q[0];
5341 _p[2] = _q[3];
5342 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005343#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005344 _p[4] = _q[5];
5345 _p[5] = _q[4];
5346 _p[6] = _q[7];
5347 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005348#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005349 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005350 maxch = Py_MAX(pblock[0], pblock[1]);
5351#if SIZEOF_LONG == 8
5352 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5353#endif
5354 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5355 if (unicode_widen(&unicode, maxch) < 0)
5356 goto onError;
5357 kind = PyUnicode_KIND(unicode);
5358 data = PyUnicode_DATA(unicode);
5359 }
5360 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5361 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5362#if SIZEOF_LONG == 8
5363 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5364 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5365#endif
5366 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005367 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005368 q = _q;
5369 if (q >= e)
5370 break;
5371 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005372 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005373
Benjamin Peterson14339b62009-01-31 16:36:08 +00005374 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005375
5376 if (ch < 0xD800 || ch > 0xDFFF) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005377 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5378 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 continue;
5380 }
5381
5382 /* UTF-16 code pair: */
5383 if (q > e) {
5384 errmsg = "unexpected end of data";
5385 startinpos = (((const char *)q) - 2) - starts;
5386 endinpos = ((const char *)e) + 1 - starts;
5387 goto utf16Error;
5388 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005389 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5390 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005391 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005392 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005393 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005394 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005395 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005396 continue;
5397 }
5398 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005399 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005400 startinpos = (((const char *)q)-4)-starts;
5401 endinpos = startinpos+2;
5402 goto utf16Error;
5403 }
5404
Benjamin Peterson14339b62009-01-31 16:36:08 +00005405 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005406 errmsg = "illegal encoding";
5407 startinpos = (((const char *)q)-2)-starts;
5408 endinpos = startinpos+2;
5409 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005410
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005413 errors,
5414 &errorHandler,
5415 "utf16", errmsg,
5416 &starts,
5417 (const char **)&e,
5418 &startinpos,
5419 &endinpos,
5420 &exc,
5421 (const char **)&q,
5422 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005423 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005424 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005426 /* remaining byte at the end? (size should be even) */
5427 if (e == q) {
5428 if (!consumed) {
5429 errmsg = "truncated data";
5430 startinpos = ((const char *)q) - starts;
5431 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005432 if (unicode_decode_call_errorhandler(
5433 errors,
5434 &errorHandler,
5435 "utf16", errmsg,
5436 &starts,
5437 (const char **)&e,
5438 &startinpos,
5439 &endinpos,
5440 &exc,
5441 (const char **)&q,
5442 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005443 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005444 goto onError;
5445 /* The remaining input chars are ignored if the callback
5446 chooses to skip the input */
5447 }
5448 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449
5450 if (byteorder)
5451 *byteorder = bo;
5452
Walter Dörwald69652032004-09-07 20:24:22 +00005453 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005454 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005455
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005457 if (PyUnicode_Resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 goto onError;
5459
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005460 Py_XDECREF(errorHandler);
5461 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005462 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463
Benjamin Peterson29060642009-01-31 22:14:21 +00005464 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005466 Py_XDECREF(errorHandler);
5467 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 return NULL;
5469}
5470
Antoine Pitrouab868312009-01-10 15:40:25 +00005471#undef FAST_CHAR_MASK
5472#undef SWAPPED_FAST_CHAR_MASK
5473
Tim Peters772747b2001-08-09 22:21:55 +00005474PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005475_PyUnicode_EncodeUTF16(PyObject *str,
5476 const char *errors,
5477 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005479 int kind;
5480 void *data;
5481 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005482 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005483 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005484 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005485 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005486 /* Offsets from p for storing byte pairs in the right order. */
5487#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5488 int ihi = 1, ilo = 0;
5489#else
5490 int ihi = 0, ilo = 1;
5491#endif
5492
Benjamin Peterson29060642009-01-31 22:14:21 +00005493#define STORECHAR(CH) \
5494 do { \
5495 p[ihi] = ((CH) >> 8) & 0xff; \
5496 p[ilo] = (CH) & 0xff; \
5497 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005498 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005500 if (!PyUnicode_Check(str)) {
5501 PyErr_BadArgument();
5502 return NULL;
5503 }
5504 if (PyUnicode_READY(str) < 0)
5505 return NULL;
5506 kind = PyUnicode_KIND(str);
5507 data = PyUnicode_DATA(str);
5508 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005509
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005510 pairs = 0;
5511 if (kind == PyUnicode_4BYTE_KIND)
5512 for (i = 0; i < len; i++)
5513 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5514 pairs++;
5515 /* 2 * (len + pairs + (byteorder == 0)) */
5516 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005518 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005519 bytesize = nsize * 2;
5520 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005522 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 if (v == NULL)
5524 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005526 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005529 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005530 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005531
5532 if (byteorder == -1) {
5533 /* force LE */
5534 ihi = 1;
5535 ilo = 0;
5536 }
5537 else if (byteorder == 1) {
5538 /* force BE */
5539 ihi = 0;
5540 ilo = 1;
5541 }
5542
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005543 for (i = 0; i < len; i++) {
5544 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5545 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005546 if (ch >= 0x10000) {
5547 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5548 ch = 0xD800 | ((ch-0x10000) >> 10);
5549 }
Tim Peters772747b2001-08-09 22:21:55 +00005550 STORECHAR(ch);
5551 if (ch2)
5552 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005553 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005554
5555 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005556 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005557#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558}
5559
Alexander Belopolsky40018472011-02-26 01:02:56 +00005560PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005561PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5562 Py_ssize_t size,
5563 const char *errors,
5564 int byteorder)
5565{
5566 PyObject *result;
5567 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5568 if (tmp == NULL)
5569 return NULL;
5570 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5571 Py_DECREF(tmp);
5572 return result;
5573}
5574
5575PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005576PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005578 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579}
5580
5581/* --- Unicode Escape Codec ----------------------------------------------- */
5582
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005583/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5584 if all the escapes in the string make it still a valid ASCII string.
5585 Returns -1 if any escapes were found which cause the string to
5586 pop out of ASCII range. Otherwise returns the length of the
5587 required buffer to hold the string.
5588 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005589static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005590length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5591{
5592 const unsigned char *p = (const unsigned char *)s;
5593 const unsigned char *end = p + size;
5594 Py_ssize_t length = 0;
5595
5596 if (size < 0)
5597 return -1;
5598
5599 for (; p < end; ++p) {
5600 if (*p > 127) {
5601 /* Non-ASCII */
5602 return -1;
5603 }
5604 else if (*p != '\\') {
5605 /* Normal character */
5606 ++length;
5607 }
5608 else {
5609 /* Backslash-escape, check next char */
5610 ++p;
5611 /* Escape sequence reaches till end of string or
5612 non-ASCII follow-up. */
5613 if (p >= end || *p > 127)
5614 return -1;
5615 switch (*p) {
5616 case '\n':
5617 /* backslash + \n result in zero characters */
5618 break;
5619 case '\\': case '\'': case '\"':
5620 case 'b': case 'f': case 't':
5621 case 'n': case 'r': case 'v': case 'a':
5622 ++length;
5623 break;
5624 case '0': case '1': case '2': case '3':
5625 case '4': case '5': case '6': case '7':
5626 case 'x': case 'u': case 'U': case 'N':
5627 /* these do not guarantee ASCII characters */
5628 return -1;
5629 default:
5630 /* count the backslash + the other character */
5631 length += 2;
5632 }
5633 }
5634 }
5635 return length;
5636}
5637
Fredrik Lundh06d12682001-01-24 07:59:11 +00005638static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005639
Alexander Belopolsky40018472011-02-26 01:02:56 +00005640PyObject *
5641PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005642 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005643 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005645 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005646 Py_ssize_t startinpos;
5647 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005648 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005649 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005651 char* message;
5652 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005653 PyObject *errorHandler = NULL;
5654 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005655 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005656 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005657
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005658 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005659
5660 /* After length_of_escaped_ascii_string() there are two alternatives,
5661 either the string is pure ASCII with named escapes like \n, etc.
5662 and we determined it's exact size (common case)
5663 or it contains \x, \u, ... escape sequences. then we create a
5664 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005665 if (len >= 0) {
5666 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005667 if (!v)
5668 goto onError;
5669 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005670 }
5671 else {
5672 /* Escaped strings will always be longer than the resulting
5673 Unicode string, so we start with size here and then reduce the
5674 length after conversion to the true value.
5675 (but if the error callback returns a long replacement string
5676 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005677 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005678 if (!v)
5679 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005680 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005681 }
5682
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005684 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005685 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005687
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 while (s < end) {
5689 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005690 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005691 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005693 /* The only case in which i == ascii_length is a backslash
5694 followed by a newline. */
5695 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005696
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 /* Non-escape characters are interpreted as Unicode ordinals */
5698 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005699 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5700 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 continue;
5702 }
5703
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005704 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 /* \ - Escapes */
5706 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005707 c = *s++;
5708 if (s > end)
5709 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005710
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005711 /* The only case in which i == ascii_length is a backslash
5712 followed by a newline. */
5713 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005714
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005715 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005718#define WRITECHAR(ch) \
5719 do { \
5720 if (unicode_putchar(&v, &i, ch) < 0) \
5721 goto onError; \
5722 }while(0)
5723
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005725 case '\\': WRITECHAR('\\'); break;
5726 case '\'': WRITECHAR('\''); break;
5727 case '\"': WRITECHAR('\"'); break;
5728 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005729 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005730 case 'f': WRITECHAR('\014'); break;
5731 case 't': WRITECHAR('\t'); break;
5732 case 'n': WRITECHAR('\n'); break;
5733 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005734 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005735 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005736 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005737 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 case '0': case '1': case '2': case '3':
5741 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005742 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005743 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005744 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005745 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005746 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005748 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749 break;
5750
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 /* hex escapes */
5752 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005754 digits = 2;
5755 message = "truncated \\xXX escape";
5756 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757
Benjamin Peterson29060642009-01-31 22:14:21 +00005758 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005760 digits = 4;
5761 message = "truncated \\uXXXX escape";
5762 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763
Benjamin Peterson29060642009-01-31 22:14:21 +00005764 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005765 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005766 digits = 8;
5767 message = "truncated \\UXXXXXXXX escape";
5768 hexescape:
5769 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005770 if (s+digits>end) {
5771 endinpos = size;
5772 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 errors, &errorHandler,
5774 "unicodeescape", "end of string in escape sequence",
5775 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005776 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005777 goto onError;
5778 goto nextByte;
5779 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005780 for (j = 0; j < digits; ++j) {
5781 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005782 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005783 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005784 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005785 errors, &errorHandler,
5786 "unicodeescape", message,
5787 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005788 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005789 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005790 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005791 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005792 }
5793 chr = (chr<<4) & ~0xF;
5794 if (c >= '0' && c <= '9')
5795 chr += c - '0';
5796 else if (c >= 'a' && c <= 'f')
5797 chr += 10 + c - 'a';
5798 else
5799 chr += 10 + c - 'A';
5800 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005801 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005802 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005803 /* _decoding_error will have already written into the
5804 target buffer. */
5805 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005806 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005807 /* when we get here, chr is a 32-bit unicode character */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005808 if (chr <= 0x10ffff) {
5809 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005810 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005811 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005812 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 errors, &errorHandler,
5814 "unicodeescape", "illegal Unicode character",
5815 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005816 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005817 goto onError;
5818 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005819 break;
5820
Benjamin Peterson29060642009-01-31 22:14:21 +00005821 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005822 case 'N':
5823 message = "malformed \\N character escape";
5824 if (ucnhash_CAPI == NULL) {
5825 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005826 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5827 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005828 if (ucnhash_CAPI == NULL)
5829 goto ucnhashError;
5830 }
5831 if (*s == '{') {
5832 const char *start = s+1;
5833 /* look for the closing brace */
5834 while (*s != '}' && s < end)
5835 s++;
5836 if (s > start && s < end && *s == '}') {
5837 /* found a name. look it up in the unicode database */
5838 message = "unknown Unicode character name";
5839 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005840 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005841 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005842 goto store;
5843 }
5844 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005845 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005846 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 errors, &errorHandler,
5848 "unicodeescape", message,
5849 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005850 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005851 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005852 break;
5853
5854 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005855 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005856 message = "\\ at end of string";
5857 s--;
5858 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005859 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005860 errors, &errorHandler,
5861 "unicodeescape", message,
5862 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005863 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005864 goto onError;
5865 }
5866 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005867 WRITECHAR('\\');
5868 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005869 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005870 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005873 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005875#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005876
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005877 if (PyUnicode_Resize(&v, i) < 0)
5878 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005879 Py_XDECREF(errorHandler);
5880 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005881 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005882
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005884 PyErr_SetString(
5885 PyExc_UnicodeError,
5886 "\\N escapes not supported (can't load unicodedata module)"
5887 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005888 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005889 Py_XDECREF(errorHandler);
5890 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005891 return NULL;
5892
Benjamin Peterson29060642009-01-31 22:14:21 +00005893 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005895 Py_XDECREF(errorHandler);
5896 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 return NULL;
5898}
5899
5900/* Return a Unicode-Escape string version of the Unicode object.
5901
5902 If quotes is true, the string is enclosed in u"" or u'' quotes as
5903 appropriate.
5904
5905*/
5906
Alexander Belopolsky40018472011-02-26 01:02:56 +00005907PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005908PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005910 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005911 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005913 int kind;
5914 void *data;
5915 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916
Thomas Wouters89f507f2006-12-13 04:49:30 +00005917 /* Initial allocation is based on the longest-possible unichr
5918 escape.
5919
5920 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5921 unichr, so in this case it's the longest unichr escape. In
5922 narrow (UTF-16) builds this is five chars per source unichr
5923 since there are two unichrs in the surrogate pair, so in narrow
5924 (UTF-16) builds it's not the longest unichr escape.
5925
5926 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5927 so in the narrow (UTF-16) build case it's the longest unichr
5928 escape.
5929 */
5930
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005931 if (!PyUnicode_Check(unicode)) {
5932 PyErr_BadArgument();
5933 return NULL;
5934 }
5935 if (PyUnicode_READY(unicode) < 0)
5936 return NULL;
5937 len = PyUnicode_GET_LENGTH(unicode);
5938 kind = PyUnicode_KIND(unicode);
5939 data = PyUnicode_DATA(unicode);
5940 switch(kind) {
5941 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5942 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5943 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5944 }
5945
5946 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005947 return PyBytes_FromStringAndSize(NULL, 0);
5948
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005949 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005951
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005952 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005954 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 if (repr == NULL)
5957 return NULL;
5958
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005959 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005961 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005962 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005963
Walter Dörwald79e913e2007-05-12 11:08:06 +00005964 /* Escape backslashes */
5965 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 *p++ = '\\';
5967 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005968 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005969 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005970
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005971 /* Map 21-bit characters to '\U00xxxxxx' */
5972 else if (ch >= 0x10000) {
5973 *p++ = '\\';
5974 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005975 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5976 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5977 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5978 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5979 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5980 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5981 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5982 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005984 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005985
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005987 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 *p++ = '\\';
5989 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005990 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5991 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5992 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5993 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005995
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005996 /* Map special whitespace to '\t', \n', '\r' */
5997 else if (ch == '\t') {
5998 *p++ = '\\';
5999 *p++ = 't';
6000 }
6001 else if (ch == '\n') {
6002 *p++ = '\\';
6003 *p++ = 'n';
6004 }
6005 else if (ch == '\r') {
6006 *p++ = '\\';
6007 *p++ = 'r';
6008 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006009
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006010 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006011 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006013 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006014 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6015 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006016 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006017
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 /* Copy everything else as-is */
6019 else
6020 *p++ = (char) ch;
6021 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006023 assert(p - PyBytes_AS_STRING(repr) > 0);
6024 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6025 return NULL;
6026 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027}
6028
Alexander Belopolsky40018472011-02-26 01:02:56 +00006029PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006030PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6031 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006033 PyObject *result;
6034 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6035 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006037 result = PyUnicode_AsUnicodeEscapeString(tmp);
6038 Py_DECREF(tmp);
6039 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040}
6041
6042/* --- Raw Unicode Escape Codec ------------------------------------------- */
6043
Alexander Belopolsky40018472011-02-26 01:02:56 +00006044PyObject *
6045PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006046 Py_ssize_t size,
6047 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006049 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006050 Py_ssize_t startinpos;
6051 Py_ssize_t endinpos;
6052 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006053 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 const char *end;
6055 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006056 PyObject *errorHandler = NULL;
6057 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006058
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 /* Escaped strings will always be longer than the resulting
6060 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006061 length after conversion to the true value. (But decoding error
6062 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006063 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006067 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006068 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 end = s + size;
6070 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 unsigned char c;
6072 Py_UCS4 x;
6073 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006074 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 /* Non-escape characters are interpreted as Unicode ordinals */
6077 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006078 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6079 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006081 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 startinpos = s-starts;
6083
6084 /* \u-escapes are only interpreted iff the number of leading
6085 backslashes if odd */
6086 bs = s;
6087 for (;s < end;) {
6088 if (*s != '\\')
6089 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006090 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6091 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006092 }
6093 if (((s - bs) & 1) == 0 ||
6094 s >= end ||
6095 (*s != 'u' && *s != 'U')) {
6096 continue;
6097 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006098 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 count = *s=='u' ? 4 : 8;
6100 s++;
6101
6102 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 for (x = 0, i = 0; i < count; ++i, ++s) {
6104 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006105 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006106 endinpos = s-starts;
6107 if (unicode_decode_call_errorhandler(
6108 errors, &errorHandler,
6109 "rawunicodeescape", "truncated \\uXXXX",
6110 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006111 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006112 goto onError;
6113 goto nextByte;
6114 }
6115 x = (x<<4) & ~0xF;
6116 if (c >= '0' && c <= '9')
6117 x += c - '0';
6118 else if (c >= 'a' && c <= 'f')
6119 x += 10 + c - 'a';
6120 else
6121 x += 10 + c - 'A';
6122 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006123 if (x <= 0x10ffff) {
6124 if (unicode_putchar(&v, &outpos, x) < 0)
6125 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006126 } else {
6127 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006128 if (unicode_decode_call_errorhandler(
6129 errors, &errorHandler,
6130 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006131 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006132 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006133 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006134 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 nextByte:
6136 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006138 if (PyUnicode_Resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006140 Py_XDECREF(errorHandler);
6141 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006142 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006143
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006146 Py_XDECREF(errorHandler);
6147 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148 return NULL;
6149}
6150
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006151
Alexander Belopolsky40018472011-02-26 01:02:56 +00006152PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006153PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006155 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156 char *p;
6157 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006158 Py_ssize_t expandsize, pos;
6159 int kind;
6160 void *data;
6161 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006163 if (!PyUnicode_Check(unicode)) {
6164 PyErr_BadArgument();
6165 return NULL;
6166 }
6167 if (PyUnicode_READY(unicode) < 0)
6168 return NULL;
6169 kind = PyUnicode_KIND(unicode);
6170 data = PyUnicode_DATA(unicode);
6171 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006172
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006173 switch(kind) {
6174 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6175 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6176 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6177 }
Victor Stinner0e368262011-11-10 20:12:49 +01006178
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006179 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006180 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006181
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006182 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183 if (repr == NULL)
6184 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006185 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006186 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006188 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006189 for (pos = 0; pos < len; pos++) {
6190 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 /* Map 32-bit characters to '\Uxxxxxxxx' */
6192 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006193 *p++ = '\\';
6194 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006195 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6196 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6197 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6198 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6199 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6200 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6201 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6202 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006203 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006205 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206 *p++ = '\\';
6207 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006208 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6209 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6210 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6211 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006213 /* Copy everything else as-is */
6214 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215 *p++ = (char) ch;
6216 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006217
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006218 assert(p > q);
6219 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006220 return NULL;
6221 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222}
6223
Alexander Belopolsky40018472011-02-26 01:02:56 +00006224PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006225PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6226 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006228 PyObject *result;
6229 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6230 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006231 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006232 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6233 Py_DECREF(tmp);
6234 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235}
6236
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006237/* --- Unicode Internal Codec ------------------------------------------- */
6238
Alexander Belopolsky40018472011-02-26 01:02:56 +00006239PyObject *
6240_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006241 Py_ssize_t size,
6242 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006243{
6244 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006245 Py_ssize_t startinpos;
6246 Py_ssize_t endinpos;
6247 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006248 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006249 const char *end;
6250 const char *reason;
6251 PyObject *errorHandler = NULL;
6252 PyObject *exc = NULL;
6253
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006254 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006255 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006256 1))
6257 return NULL;
6258
Thomas Wouters89f507f2006-12-13 04:49:30 +00006259 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006260 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006261 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006263 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006264 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006265 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006266 end = s + size;
6267
6268 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006269 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006270 Py_UCS4 ch;
6271 /* We copy the raw representation one byte at a time because the
6272 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006273 ((char *) &uch)[0] = s[0];
6274 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006275#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006276 ((char *) &uch)[2] = s[2];
6277 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006278#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006279 ch = uch;
6280
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006281 /* We have to sanity check the raw data, otherwise doom looms for
6282 some malformed UCS-4 data. */
6283 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006284#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006285 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006286#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006287 end-s < Py_UNICODE_SIZE
6288 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006289 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006290 startinpos = s - starts;
6291 if (end-s < Py_UNICODE_SIZE) {
6292 endinpos = end-starts;
6293 reason = "truncated input";
6294 }
6295 else {
6296 endinpos = s - starts + Py_UNICODE_SIZE;
6297 reason = "illegal code point (> 0x10FFFF)";
6298 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006299 if (unicode_decode_call_errorhandler(
6300 errors, &errorHandler,
6301 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006302 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006303 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006304 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006305 continue;
6306 }
6307
6308 s += Py_UNICODE_SIZE;
6309#ifndef Py_UNICODE_WIDE
6310 if (ch >= 0xD800 && ch <= 0xDBFF && s < end)
6311 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006312 Py_UNICODE uch2;
6313 ((char *) &uch2)[0] = s[0];
6314 ((char *) &uch2)[1] = s[1];
6315 if (uch2 >= 0xDC00 && uch2 <= 0xDFFF)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006316 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006317 ch = (((uch & 0x3FF)<<10) | (uch2 & 0x3FF)) + 0x10000;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006318 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006319 }
6320 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006321#endif
6322
6323 if (unicode_putchar(&v, &outpos, ch) < 0)
6324 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006325 }
6326
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006327 if (PyUnicode_Resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006328 goto onError;
6329 Py_XDECREF(errorHandler);
6330 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006331 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006332
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006334 Py_XDECREF(v);
6335 Py_XDECREF(errorHandler);
6336 Py_XDECREF(exc);
6337 return NULL;
6338}
6339
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340/* --- Latin-1 Codec ------------------------------------------------------ */
6341
Alexander Belopolsky40018472011-02-26 01:02:56 +00006342PyObject *
6343PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006344 Py_ssize_t size,
6345 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006348 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349}
6350
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006351/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006352static void
6353make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006354 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006355 PyObject *unicode,
6356 Py_ssize_t startpos, Py_ssize_t endpos,
6357 const char *reason)
6358{
6359 if (*exceptionObject == NULL) {
6360 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006361 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006362 encoding, unicode, startpos, endpos, reason);
6363 }
6364 else {
6365 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6366 goto onError;
6367 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6368 goto onError;
6369 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6370 goto onError;
6371 return;
6372 onError:
6373 Py_DECREF(*exceptionObject);
6374 *exceptionObject = NULL;
6375 }
6376}
6377
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006378/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006379static void
6380raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006381 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006382 PyObject *unicode,
6383 Py_ssize_t startpos, Py_ssize_t endpos,
6384 const char *reason)
6385{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006386 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006387 encoding, unicode, startpos, endpos, reason);
6388 if (*exceptionObject != NULL)
6389 PyCodec_StrictErrors(*exceptionObject);
6390}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006391
6392/* error handling callback helper:
6393 build arguments, call the callback and check the arguments,
6394 put the result into newpos and return the replacement string, which
6395 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006396static PyObject *
6397unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006398 PyObject **errorHandler,
6399 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006400 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006401 Py_ssize_t startpos, Py_ssize_t endpos,
6402 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006403{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006404 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006405 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406 PyObject *restuple;
6407 PyObject *resunicode;
6408
6409 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006411 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006413 }
6414
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006415 if (PyUnicode_READY(unicode) < 0)
6416 return NULL;
6417 len = PyUnicode_GET_LENGTH(unicode);
6418
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006419 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006420 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006422 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006423
6424 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006425 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006426 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006428 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006429 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006430 Py_DECREF(restuple);
6431 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006432 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006433 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006434 &resunicode, newpos)) {
6435 Py_DECREF(restuple);
6436 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006437 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006438 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6439 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6440 Py_DECREF(restuple);
6441 return NULL;
6442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006443 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006444 *newpos = len + *newpos;
6445 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6447 Py_DECREF(restuple);
6448 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006449 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006450 Py_INCREF(resunicode);
6451 Py_DECREF(restuple);
6452 return resunicode;
6453}
6454
Alexander Belopolsky40018472011-02-26 01:02:56 +00006455static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006456unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006457 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006458 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006459{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006460 /* input state */
6461 Py_ssize_t pos=0, size;
6462 int kind;
6463 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006464 /* output object */
6465 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006466 /* pointer into the output */
6467 char *str;
6468 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006469 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006470 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6471 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006472 PyObject *errorHandler = NULL;
6473 PyObject *exc = NULL;
6474 /* the following variable is used for caching string comparisons
6475 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6476 int known_errorHandler = -1;
6477
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006478 if (PyUnicode_READY(unicode) < 0)
6479 return NULL;
6480 size = PyUnicode_GET_LENGTH(unicode);
6481 kind = PyUnicode_KIND(unicode);
6482 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483 /* allocate enough for a simple encoding without
6484 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006485 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006486 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006487 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006488 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006489 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006490 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006491 ressize = size;
6492
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006493 while (pos < size) {
6494 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006495
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 /* can we encode this? */
6497 if (c<limit) {
6498 /* no overflow check, because we know that the space is enough */
6499 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006500 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006501 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 Py_ssize_t requiredsize;
6504 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006505 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006507 Py_ssize_t collstart = pos;
6508 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006509 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006510 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 ++collend;
6512 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6513 if (known_errorHandler==-1) {
6514 if ((errors==NULL) || (!strcmp(errors, "strict")))
6515 known_errorHandler = 1;
6516 else if (!strcmp(errors, "replace"))
6517 known_errorHandler = 2;
6518 else if (!strcmp(errors, "ignore"))
6519 known_errorHandler = 3;
6520 else if (!strcmp(errors, "xmlcharrefreplace"))
6521 known_errorHandler = 4;
6522 else
6523 known_errorHandler = 0;
6524 }
6525 switch (known_errorHandler) {
6526 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006527 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 goto onError;
6529 case 2: /* replace */
6530 while (collstart++<collend)
6531 *str++ = '?'; /* fall through */
6532 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006533 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 break;
6535 case 4: /* xmlcharrefreplace */
6536 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006537 /* determine replacement size */
6538 for (i = collstart, repsize = 0; i < collend; ++i) {
6539 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6540 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006542 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006543 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006544 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006545 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006546 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006548#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 else
6550 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006551#else
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006552 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006554 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006555 repsize += 2+6+1;
6556 else
6557 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006558#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006560 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006561 if (requiredsize > ressize) {
6562 if (requiredsize<2*ressize)
6563 requiredsize = 2*ressize;
6564 if (_PyBytes_Resize(&res, requiredsize))
6565 goto onError;
6566 str = PyBytes_AS_STRING(res) + respos;
6567 ressize = requiredsize;
6568 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006569 /* generate replacement */
6570 for (i = collstart; i < collend; ++i) {
6571 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006572 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006573 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 break;
6575 default:
6576 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006577 encoding, reason, unicode, &exc,
6578 collstart, collend, &newpos);
6579 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6580 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006582 if (PyBytes_Check(repunicode)) {
6583 /* Directly copy bytes result to output. */
6584 repsize = PyBytes_Size(repunicode);
6585 if (repsize > 1) {
6586 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006587 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006588 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6589 Py_DECREF(repunicode);
6590 goto onError;
6591 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006592 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006593 ressize += repsize-1;
6594 }
6595 memcpy(str, PyBytes_AsString(repunicode), repsize);
6596 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006597 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006598 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006599 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006600 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 /* need more space? (at least enough for what we
6602 have+the replacement+the rest of the string, so
6603 we won't have to check space for encodable characters) */
6604 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006605 repsize = PyUnicode_GET_LENGTH(repunicode);
6606 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 if (requiredsize > ressize) {
6608 if (requiredsize<2*ressize)
6609 requiredsize = 2*ressize;
6610 if (_PyBytes_Resize(&res, requiredsize)) {
6611 Py_DECREF(repunicode);
6612 goto onError;
6613 }
6614 str = PyBytes_AS_STRING(res) + respos;
6615 ressize = requiredsize;
6616 }
6617 /* check if there is anything unencodable in the replacement
6618 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006619 for (i = 0; repsize-->0; ++i, ++str) {
6620 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006621 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006622 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006623 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 Py_DECREF(repunicode);
6625 goto onError;
6626 }
6627 *str = (char)c;
6628 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006629 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006630 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006631 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006632 }
6633 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006634 /* Resize if we allocated to much */
6635 size = str - PyBytes_AS_STRING(res);
6636 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006637 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006638 if (_PyBytes_Resize(&res, size) < 0)
6639 goto onError;
6640 }
6641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006642 Py_XDECREF(errorHandler);
6643 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006644 return res;
6645
6646 onError:
6647 Py_XDECREF(res);
6648 Py_XDECREF(errorHandler);
6649 Py_XDECREF(exc);
6650 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006651}
6652
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006653/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006654PyObject *
6655PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006656 Py_ssize_t size,
6657 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006659 PyObject *result;
6660 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6661 if (unicode == NULL)
6662 return NULL;
6663 result = unicode_encode_ucs1(unicode, errors, 256);
6664 Py_DECREF(unicode);
6665 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666}
6667
Alexander Belopolsky40018472011-02-26 01:02:56 +00006668PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006669_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670{
6671 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 PyErr_BadArgument();
6673 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006675 if (PyUnicode_READY(unicode) == -1)
6676 return NULL;
6677 /* Fast path: if it is a one-byte string, construct
6678 bytes object directly. */
6679 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6680 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6681 PyUnicode_GET_LENGTH(unicode));
6682 /* Non-Latin-1 characters present. Defer to above function to
6683 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006684 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006685}
6686
6687PyObject*
6688PyUnicode_AsLatin1String(PyObject *unicode)
6689{
6690 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691}
6692
6693/* --- 7-bit ASCII Codec -------------------------------------------------- */
6694
Alexander Belopolsky40018472011-02-26 01:02:56 +00006695PyObject *
6696PyUnicode_DecodeASCII(const char *s,
6697 Py_ssize_t size,
6698 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006700 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006701 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006702 int kind;
6703 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006704 Py_ssize_t startinpos;
6705 Py_ssize_t endinpos;
6706 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006707 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006708 int has_error;
6709 const unsigned char *p = (const unsigned char *)s;
6710 const unsigned char *end = p + size;
6711 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006712 PyObject *errorHandler = NULL;
6713 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006714
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006715 if (size == 0) {
6716 Py_INCREF(unicode_empty);
6717 return unicode_empty;
6718 }
6719
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006721 if (size == 1 && (unsigned char)s[0] < 128)
6722 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006723
Victor Stinner702c7342011-10-05 13:50:52 +02006724 has_error = 0;
6725 while (p < end && !has_error) {
6726 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6727 an explanation. */
6728 if (!((size_t) p & LONG_PTR_MASK)) {
6729 /* Help register allocation */
6730 register const unsigned char *_p = p;
6731 while (_p < aligned_end) {
6732 unsigned long value = *(unsigned long *) _p;
6733 if (value & ASCII_CHAR_MASK) {
6734 has_error = 1;
6735 break;
6736 }
6737 _p += SIZEOF_LONG;
6738 }
6739 if (_p == end)
6740 break;
6741 if (has_error)
6742 break;
6743 p = _p;
6744 }
6745 if (*p & 0x80) {
6746 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006747 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006748 }
6749 else {
6750 ++p;
6751 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006752 }
Victor Stinner702c7342011-10-05 13:50:52 +02006753 if (!has_error)
6754 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006755
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006756 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006758 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006760 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006761 kind = PyUnicode_KIND(v);
6762 data = PyUnicode_DATA(v);
6763 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006764 e = s + size;
6765 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 register unsigned char c = (unsigned char)*s;
6767 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006768 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006769 ++s;
6770 }
6771 else {
6772 startinpos = s-starts;
6773 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 if (unicode_decode_call_errorhandler(
6775 errors, &errorHandler,
6776 "ascii", "ordinal not in range(128)",
6777 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006778 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006780 kind = PyUnicode_KIND(v);
6781 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006784 if (PyUnicode_Resize(&v, outpos) < 0)
6785 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006786 Py_XDECREF(errorHandler);
6787 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006788 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006789 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006790
Benjamin Peterson29060642009-01-31 22:14:21 +00006791 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006793 Py_XDECREF(errorHandler);
6794 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795 return NULL;
6796}
6797
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006798/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006799PyObject *
6800PyUnicode_EncodeASCII(const Py_UNICODE *p,
6801 Py_ssize_t size,
6802 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006804 PyObject *result;
6805 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6806 if (unicode == NULL)
6807 return NULL;
6808 result = unicode_encode_ucs1(unicode, errors, 128);
6809 Py_DECREF(unicode);
6810 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811}
6812
Alexander Belopolsky40018472011-02-26 01:02:56 +00006813PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006814_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815{
6816 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006817 PyErr_BadArgument();
6818 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006820 if (PyUnicode_READY(unicode) == -1)
6821 return NULL;
6822 /* Fast path: if it is an ASCII-only string, construct bytes object
6823 directly. Else defer to above function to raise the exception. */
6824 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6825 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6826 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006827 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006828}
6829
6830PyObject *
6831PyUnicode_AsASCIIString(PyObject *unicode)
6832{
6833 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834}
6835
Victor Stinner99b95382011-07-04 14:23:54 +02006836#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006837
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006838/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006839
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006840#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006841#define NEED_RETRY
6842#endif
6843
Victor Stinner3a50e702011-10-18 21:21:00 +02006844#ifndef WC_ERR_INVALID_CHARS
6845# define WC_ERR_INVALID_CHARS 0x0080
6846#endif
6847
6848static char*
6849code_page_name(UINT code_page, PyObject **obj)
6850{
6851 *obj = NULL;
6852 if (code_page == CP_ACP)
6853 return "mbcs";
6854 if (code_page == CP_UTF7)
6855 return "CP_UTF7";
6856 if (code_page == CP_UTF8)
6857 return "CP_UTF8";
6858
6859 *obj = PyBytes_FromFormat("cp%u", code_page);
6860 if (*obj == NULL)
6861 return NULL;
6862 return PyBytes_AS_STRING(*obj);
6863}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006864
Alexander Belopolsky40018472011-02-26 01:02:56 +00006865static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006866is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006867{
6868 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006869 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006870
Victor Stinner3a50e702011-10-18 21:21:00 +02006871 if (!IsDBCSLeadByteEx(code_page, *curr))
6872 return 0;
6873
6874 prev = CharPrevExA(code_page, s, curr, 0);
6875 if (prev == curr)
6876 return 1;
6877 /* FIXME: This code is limited to "true" double-byte encodings,
6878 as it assumes an incomplete character consists of a single
6879 byte. */
6880 if (curr - prev == 2)
6881 return 1;
6882 if (!IsDBCSLeadByteEx(code_page, *prev))
6883 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006884 return 0;
6885}
6886
Victor Stinner3a50e702011-10-18 21:21:00 +02006887static DWORD
6888decode_code_page_flags(UINT code_page)
6889{
6890 if (code_page == CP_UTF7) {
6891 /* The CP_UTF7 decoder only supports flags=0 */
6892 return 0;
6893 }
6894 else
6895 return MB_ERR_INVALID_CHARS;
6896}
6897
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006898/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006899 * Decode a byte string from a Windows code page into unicode object in strict
6900 * mode.
6901 *
6902 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6903 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006904 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006905static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006906decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006907 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006908 const char *in,
6909 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006910{
Victor Stinner3a50e702011-10-18 21:21:00 +02006911 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006912 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006913 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006914
6915 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006916 assert(insize > 0);
6917 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6918 if (outsize <= 0)
6919 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006920
6921 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006922 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006923 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006924 if (*v == NULL)
6925 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006926 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006927 }
6928 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006930 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006931 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006932 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006933 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006934 }
6935
6936 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006937 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6938 if (outsize <= 0)
6939 goto error;
6940 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006941
Victor Stinner3a50e702011-10-18 21:21:00 +02006942error:
6943 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6944 return -2;
6945 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006946 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006947}
6948
Victor Stinner3a50e702011-10-18 21:21:00 +02006949/*
6950 * Decode a byte string from a code page into unicode object with an error
6951 * handler.
6952 *
6953 * Returns consumed size if succeed, or raise a WindowsError or
6954 * UnicodeDecodeError exception and returns -1 on error.
6955 */
6956static int
6957decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006958 PyObject **v,
6959 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006960 const char *errors)
6961{
6962 const char *startin = in;
6963 const char *endin = in + size;
6964 const DWORD flags = decode_code_page_flags(code_page);
6965 /* Ideally, we should get reason from FormatMessage. This is the Windows
6966 2000 English version of the message. */
6967 const char *reason = "No mapping for the Unicode character exists "
6968 "in the target code page.";
6969 /* each step cannot decode more than 1 character, but a character can be
6970 represented as a surrogate pair */
6971 wchar_t buffer[2], *startout, *out;
6972 int insize, outsize;
6973 PyObject *errorHandler = NULL;
6974 PyObject *exc = NULL;
6975 PyObject *encoding_obj = NULL;
6976 char *encoding;
6977 DWORD err;
6978 int ret = -1;
6979
6980 assert(size > 0);
6981
6982 encoding = code_page_name(code_page, &encoding_obj);
6983 if (encoding == NULL)
6984 return -1;
6985
6986 if (errors == NULL || strcmp(errors, "strict") == 0) {
6987 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6988 UnicodeDecodeError. */
6989 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6990 if (exc != NULL) {
6991 PyCodec_StrictErrors(exc);
6992 Py_CLEAR(exc);
6993 }
6994 goto error;
6995 }
6996
6997 if (*v == NULL) {
6998 /* Create unicode object */
6999 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7000 PyErr_NoMemory();
7001 goto error;
7002 }
Victor Stinner76a31a62011-11-04 00:05:13 +01007003 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007004 if (*v == NULL)
7005 goto error;
7006 startout = PyUnicode_AS_UNICODE(*v);
7007 }
7008 else {
7009 /* Extend unicode object */
7010 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7011 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7012 PyErr_NoMemory();
7013 goto error;
7014 }
Victor Stinner76a31a62011-11-04 00:05:13 +01007015 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007016 goto error;
7017 startout = PyUnicode_AS_UNICODE(*v) + n;
7018 }
7019
7020 /* Decode the byte string character per character */
7021 out = startout;
7022 while (in < endin)
7023 {
7024 /* Decode a character */
7025 insize = 1;
7026 do
7027 {
7028 outsize = MultiByteToWideChar(code_page, flags,
7029 in, insize,
7030 buffer, Py_ARRAY_LENGTH(buffer));
7031 if (outsize > 0)
7032 break;
7033 err = GetLastError();
7034 if (err != ERROR_NO_UNICODE_TRANSLATION
7035 && err != ERROR_INSUFFICIENT_BUFFER)
7036 {
7037 PyErr_SetFromWindowsErr(0);
7038 goto error;
7039 }
7040 insize++;
7041 }
7042 /* 4=maximum length of a UTF-8 sequence */
7043 while (insize <= 4 && (in + insize) <= endin);
7044
7045 if (outsize <= 0) {
7046 Py_ssize_t startinpos, endinpos, outpos;
7047
7048 startinpos = in - startin;
7049 endinpos = startinpos + 1;
7050 outpos = out - PyUnicode_AS_UNICODE(*v);
7051 if (unicode_decode_call_errorhandler(
7052 errors, &errorHandler,
7053 encoding, reason,
7054 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007055 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007056 {
7057 goto error;
7058 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007059 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007060 }
7061 else {
7062 in += insize;
7063 memcpy(out, buffer, outsize * sizeof(wchar_t));
7064 out += outsize;
7065 }
7066 }
7067
7068 /* write a NUL character at the end */
7069 *out = 0;
7070
7071 /* Extend unicode object */
7072 outsize = out - startout;
7073 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007074 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007075 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007076 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007077
7078error:
7079 Py_XDECREF(encoding_obj);
7080 Py_XDECREF(errorHandler);
7081 Py_XDECREF(exc);
7082 return ret;
7083}
7084
Victor Stinner3a50e702011-10-18 21:21:00 +02007085static PyObject *
7086decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007087 const char *s, Py_ssize_t size,
7088 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007089{
Victor Stinner76a31a62011-11-04 00:05:13 +01007090 PyObject *v = NULL;
7091 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007092
Victor Stinner3a50e702011-10-18 21:21:00 +02007093 if (code_page < 0) {
7094 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7095 return NULL;
7096 }
7097
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007098 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007099 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007100
Victor Stinner76a31a62011-11-04 00:05:13 +01007101 do
7102 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007103#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007104 if (size > INT_MAX) {
7105 chunk_size = INT_MAX;
7106 final = 0;
7107 done = 0;
7108 }
7109 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007110#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007111 {
7112 chunk_size = (int)size;
7113 final = (consumed == NULL);
7114 done = 1;
7115 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007116
Victor Stinner76a31a62011-11-04 00:05:13 +01007117 /* Skip trailing lead-byte unless 'final' is set */
7118 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7119 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007120
Victor Stinner76a31a62011-11-04 00:05:13 +01007121 if (chunk_size == 0 && done) {
7122 if (v != NULL)
7123 break;
7124 Py_INCREF(unicode_empty);
7125 return unicode_empty;
7126 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007127
Victor Stinner76a31a62011-11-04 00:05:13 +01007128
7129 converted = decode_code_page_strict(code_page, &v,
7130 s, chunk_size);
7131 if (converted == -2)
7132 converted = decode_code_page_errors(code_page, &v,
7133 s, chunk_size,
7134 errors);
7135 assert(converted != 0);
7136
7137 if (converted < 0) {
7138 Py_XDECREF(v);
7139 return NULL;
7140 }
7141
7142 if (consumed)
7143 *consumed += converted;
7144
7145 s += converted;
7146 size -= converted;
7147 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007148
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007149 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007150}
7151
Alexander Belopolsky40018472011-02-26 01:02:56 +00007152PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007153PyUnicode_DecodeCodePageStateful(int code_page,
7154 const char *s,
7155 Py_ssize_t size,
7156 const char *errors,
7157 Py_ssize_t *consumed)
7158{
7159 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7160}
7161
7162PyObject *
7163PyUnicode_DecodeMBCSStateful(const char *s,
7164 Py_ssize_t size,
7165 const char *errors,
7166 Py_ssize_t *consumed)
7167{
7168 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7169}
7170
7171PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007172PyUnicode_DecodeMBCS(const char *s,
7173 Py_ssize_t size,
7174 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007175{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007176 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7177}
7178
Victor Stinner3a50e702011-10-18 21:21:00 +02007179static DWORD
7180encode_code_page_flags(UINT code_page, const char *errors)
7181{
7182 if (code_page == CP_UTF8) {
7183 if (winver.dwMajorVersion >= 6)
7184 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7185 and later */
7186 return WC_ERR_INVALID_CHARS;
7187 else
7188 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7189 return 0;
7190 }
7191 else if (code_page == CP_UTF7) {
7192 /* CP_UTF7 only supports flags=0 */
7193 return 0;
7194 }
7195 else {
7196 if (errors != NULL && strcmp(errors, "replace") == 0)
7197 return 0;
7198 else
7199 return WC_NO_BEST_FIT_CHARS;
7200 }
7201}
7202
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007203/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007204 * Encode a Unicode string to a Windows code page into a byte string in strict
7205 * mode.
7206 *
7207 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7208 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007209 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007210static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007211encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007212 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007213 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007214{
Victor Stinner554f3f02010-06-16 23:33:54 +00007215 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 BOOL *pusedDefaultChar = &usedDefaultChar;
7217 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007218 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007219 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007220 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007221 const DWORD flags = encode_code_page_flags(code_page, NULL);
7222 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007223 /* Create a substring so that we can get the UTF-16 representation
7224 of just the slice under consideration. */
7225 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007226
Martin v. Löwis3d325192011-11-04 18:23:06 +01007227 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007228
Victor Stinner3a50e702011-10-18 21:21:00 +02007229 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007230 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007231 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007232 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007233
Victor Stinner2fc507f2011-11-04 20:06:39 +01007234 substring = PyUnicode_Substring(unicode, offset, offset+len);
7235 if (substring == NULL)
7236 return -1;
7237 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7238 if (p == NULL) {
7239 Py_DECREF(substring);
7240 return -1;
7241 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007242
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007243 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007244 outsize = WideCharToMultiByte(code_page, flags,
7245 p, size,
7246 NULL, 0,
7247 NULL, pusedDefaultChar);
7248 if (outsize <= 0)
7249 goto error;
7250 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007251 if (pusedDefaultChar && *pusedDefaultChar) {
7252 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007253 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007254 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007255
Victor Stinner3a50e702011-10-18 21:21:00 +02007256 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007257 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007258 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007259 if (*outbytes == NULL) {
7260 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007261 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007262 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007264 }
7265 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007266 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007267 const Py_ssize_t n = PyBytes_Size(*outbytes);
7268 if (outsize > PY_SSIZE_T_MAX - n) {
7269 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007270 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007271 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007272 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007273 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7274 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007275 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007276 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007277 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007278 }
7279
7280 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007281 outsize = WideCharToMultiByte(code_page, flags,
7282 p, size,
7283 out, outsize,
7284 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007285 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007286 if (outsize <= 0)
7287 goto error;
7288 if (pusedDefaultChar && *pusedDefaultChar)
7289 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007290 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007291
Victor Stinner3a50e702011-10-18 21:21:00 +02007292error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007293 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007294 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7295 return -2;
7296 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007297 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007298}
7299
Victor Stinner3a50e702011-10-18 21:21:00 +02007300/*
7301 * Encode a Unicode string to a Windows code page into a byte string using a
7302 * error handler.
7303 *
7304 * Returns consumed characters if succeed, or raise a WindowsError and returns
7305 * -1 on other error.
7306 */
7307static int
7308encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007309 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007310 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007311{
Victor Stinner3a50e702011-10-18 21:21:00 +02007312 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007313 Py_ssize_t pos = unicode_offset;
7314 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007315 /* Ideally, we should get reason from FormatMessage. This is the Windows
7316 2000 English version of the message. */
7317 const char *reason = "invalid character";
7318 /* 4=maximum length of a UTF-8 sequence */
7319 char buffer[4];
7320 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7321 Py_ssize_t outsize;
7322 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007323 PyObject *errorHandler = NULL;
7324 PyObject *exc = NULL;
7325 PyObject *encoding_obj = NULL;
7326 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007327 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007328 PyObject *rep;
7329 int ret = -1;
7330
7331 assert(insize > 0);
7332
7333 encoding = code_page_name(code_page, &encoding_obj);
7334 if (encoding == NULL)
7335 return -1;
7336
7337 if (errors == NULL || strcmp(errors, "strict") == 0) {
7338 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7339 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007340 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007341 if (exc != NULL) {
7342 PyCodec_StrictErrors(exc);
7343 Py_DECREF(exc);
7344 }
7345 Py_XDECREF(encoding_obj);
7346 return -1;
7347 }
7348
7349 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7350 pusedDefaultChar = &usedDefaultChar;
7351 else
7352 pusedDefaultChar = NULL;
7353
7354 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7355 PyErr_NoMemory();
7356 goto error;
7357 }
7358 outsize = insize * Py_ARRAY_LENGTH(buffer);
7359
7360 if (*outbytes == NULL) {
7361 /* Create string object */
7362 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7363 if (*outbytes == NULL)
7364 goto error;
7365 out = PyBytes_AS_STRING(*outbytes);
7366 }
7367 else {
7368 /* Extend string object */
7369 Py_ssize_t n = PyBytes_Size(*outbytes);
7370 if (n > PY_SSIZE_T_MAX - outsize) {
7371 PyErr_NoMemory();
7372 goto error;
7373 }
7374 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7375 goto error;
7376 out = PyBytes_AS_STRING(*outbytes) + n;
7377 }
7378
7379 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007380 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007381 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007382 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7383 wchar_t chars[2];
7384 int charsize;
7385 if (ch < 0x10000) {
7386 chars[0] = (wchar_t)ch;
7387 charsize = 1;
7388 }
7389 else {
7390 ch -= 0x10000;
7391 chars[0] = 0xd800 + (ch >> 10);
7392 chars[1] = 0xdc00 + (ch & 0x3ff);
7393 charsize = 2;
7394 }
7395
Victor Stinner3a50e702011-10-18 21:21:00 +02007396 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007397 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007398 buffer, Py_ARRAY_LENGTH(buffer),
7399 NULL, pusedDefaultChar);
7400 if (outsize > 0) {
7401 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7402 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007403 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007404 memcpy(out, buffer, outsize);
7405 out += outsize;
7406 continue;
7407 }
7408 }
7409 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7410 PyErr_SetFromWindowsErr(0);
7411 goto error;
7412 }
7413
Victor Stinner3a50e702011-10-18 21:21:00 +02007414 rep = unicode_encode_call_errorhandler(
7415 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007416 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007417 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007418 if (rep == NULL)
7419 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007420 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007421
7422 if (PyBytes_Check(rep)) {
7423 outsize = PyBytes_GET_SIZE(rep);
7424 if (outsize != 1) {
7425 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7426 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7427 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7428 Py_DECREF(rep);
7429 goto error;
7430 }
7431 out = PyBytes_AS_STRING(*outbytes) + offset;
7432 }
7433 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7434 out += outsize;
7435 }
7436 else {
7437 Py_ssize_t i;
7438 enum PyUnicode_Kind kind;
7439 void *data;
7440
7441 if (PyUnicode_READY(rep) < 0) {
7442 Py_DECREF(rep);
7443 goto error;
7444 }
7445
7446 outsize = PyUnicode_GET_LENGTH(rep);
7447 if (outsize != 1) {
7448 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7449 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7450 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7451 Py_DECREF(rep);
7452 goto error;
7453 }
7454 out = PyBytes_AS_STRING(*outbytes) + offset;
7455 }
7456 kind = PyUnicode_KIND(rep);
7457 data = PyUnicode_DATA(rep);
7458 for (i=0; i < outsize; i++) {
7459 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7460 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007461 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007462 encoding, unicode,
7463 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007464 "unable to encode error handler result to ASCII");
7465 Py_DECREF(rep);
7466 goto error;
7467 }
7468 *out = (unsigned char)ch;
7469 out++;
7470 }
7471 }
7472 Py_DECREF(rep);
7473 }
7474 /* write a NUL byte */
7475 *out = 0;
7476 outsize = out - PyBytes_AS_STRING(*outbytes);
7477 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7478 if (_PyBytes_Resize(outbytes, outsize) < 0)
7479 goto error;
7480 ret = 0;
7481
7482error:
7483 Py_XDECREF(encoding_obj);
7484 Py_XDECREF(errorHandler);
7485 Py_XDECREF(exc);
7486 return ret;
7487}
7488
Victor Stinner3a50e702011-10-18 21:21:00 +02007489static PyObject *
7490encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007491 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 const char *errors)
7493{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007494 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007496 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007497 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007498
Victor Stinner2fc507f2011-11-04 20:06:39 +01007499 if (PyUnicode_READY(unicode) < 0)
7500 return NULL;
7501 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007502
Victor Stinner3a50e702011-10-18 21:21:00 +02007503 if (code_page < 0) {
7504 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7505 return NULL;
7506 }
7507
Martin v. Löwis3d325192011-11-04 18:23:06 +01007508 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007509 return PyBytes_FromStringAndSize(NULL, 0);
7510
Victor Stinner7581cef2011-11-03 22:32:33 +01007511 offset = 0;
7512 do
7513 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007514#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007515 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007516 chunks. */
7517 if (len > INT_MAX/2) {
7518 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007519 done = 0;
7520 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007521 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007522#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007523 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007524 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007525 done = 1;
7526 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007527
Victor Stinner76a31a62011-11-04 00:05:13 +01007528 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007529 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007530 errors);
7531 if (ret == -2)
7532 ret = encode_code_page_errors(code_page, &outbytes,
7533 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007534 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007535 if (ret < 0) {
7536 Py_XDECREF(outbytes);
7537 return NULL;
7538 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007539
Victor Stinner7581cef2011-11-03 22:32:33 +01007540 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007541 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007542 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007543
Victor Stinner3a50e702011-10-18 21:21:00 +02007544 return outbytes;
7545}
7546
7547PyObject *
7548PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7549 Py_ssize_t size,
7550 const char *errors)
7551{
Victor Stinner7581cef2011-11-03 22:32:33 +01007552 PyObject *unicode, *res;
7553 unicode = PyUnicode_FromUnicode(p, size);
7554 if (unicode == NULL)
7555 return NULL;
7556 res = encode_code_page(CP_ACP, unicode, errors);
7557 Py_DECREF(unicode);
7558 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007559}
7560
7561PyObject *
7562PyUnicode_EncodeCodePage(int code_page,
7563 PyObject *unicode,
7564 const char *errors)
7565{
Victor Stinner7581cef2011-11-03 22:32:33 +01007566 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007567}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007568
Alexander Belopolsky40018472011-02-26 01:02:56 +00007569PyObject *
7570PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007571{
7572 if (!PyUnicode_Check(unicode)) {
7573 PyErr_BadArgument();
7574 return NULL;
7575 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007576 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007577}
7578
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007579#undef NEED_RETRY
7580
Victor Stinner99b95382011-07-04 14:23:54 +02007581#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007582
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583/* --- Character Mapping Codec -------------------------------------------- */
7584
Alexander Belopolsky40018472011-02-26 01:02:56 +00007585PyObject *
7586PyUnicode_DecodeCharmap(const char *s,
7587 Py_ssize_t size,
7588 PyObject *mapping,
7589 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007591 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007592 Py_ssize_t startinpos;
7593 Py_ssize_t endinpos;
7594 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007595 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007596 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007597 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007598 PyObject *errorHandler = NULL;
7599 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007600
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601 /* Default to Latin-1 */
7602 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007603 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007605 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007607 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007609 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007610 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007611 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007612 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007613 Py_ssize_t maplen;
7614 enum PyUnicode_Kind kind;
7615 void *data;
7616 Py_UCS4 x;
7617
7618 if (PyUnicode_READY(mapping) < 0)
7619 return NULL;
7620
7621 maplen = PyUnicode_GET_LENGTH(mapping);
7622 data = PyUnicode_DATA(mapping);
7623 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007624 while (s < e) {
7625 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626
Benjamin Peterson29060642009-01-31 22:14:21 +00007627 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007628 x = PyUnicode_READ(kind, data, ch);
7629 else
7630 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007632 if (x == 0xfffe)
7633 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007634 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007635 startinpos = s-starts;
7636 endinpos = startinpos+1;
7637 if (unicode_decode_call_errorhandler(
7638 errors, &errorHandler,
7639 "charmap", "character maps to <undefined>",
7640 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007641 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007642 goto onError;
7643 }
7644 continue;
7645 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007646
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007647 if (unicode_putchar(&v, &outpos, x) < 0)
7648 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007649 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007650 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007651 }
7652 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007653 while (s < e) {
7654 unsigned char ch = *s;
7655 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007656
Benjamin Peterson29060642009-01-31 22:14:21 +00007657 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7658 w = PyLong_FromLong((long)ch);
7659 if (w == NULL)
7660 goto onError;
7661 x = PyObject_GetItem(mapping, w);
7662 Py_DECREF(w);
7663 if (x == NULL) {
7664 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7665 /* No mapping found means: mapping is undefined. */
7666 PyErr_Clear();
7667 x = Py_None;
7668 Py_INCREF(x);
7669 } else
7670 goto onError;
7671 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007672
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 /* Apply mapping */
7674 if (PyLong_Check(x)) {
7675 long value = PyLong_AS_LONG(x);
7676 if (value < 0 || value > 65535) {
7677 PyErr_SetString(PyExc_TypeError,
7678 "character mapping must be in range(65536)");
7679 Py_DECREF(x);
7680 goto onError;
7681 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007682 if (unicode_putchar(&v, &outpos, value) < 0)
7683 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 }
7685 else if (x == Py_None) {
7686 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007687 startinpos = s-starts;
7688 endinpos = startinpos+1;
7689 if (unicode_decode_call_errorhandler(
7690 errors, &errorHandler,
7691 "charmap", "character maps to <undefined>",
7692 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007693 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007694 Py_DECREF(x);
7695 goto onError;
7696 }
7697 Py_DECREF(x);
7698 continue;
7699 }
7700 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007701 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007702
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007703 if (PyUnicode_READY(x) < 0)
7704 goto onError;
7705 targetsize = PyUnicode_GET_LENGTH(x);
7706
7707 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007708 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007709 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007710 PyUnicode_READ_CHAR(x, 0)) < 0)
7711 goto onError;
7712 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007713 else if (targetsize > 1) {
7714 /* 1-n mapping */
7715 if (targetsize > extrachars) {
7716 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007717 Py_ssize_t needed = (targetsize - extrachars) + \
7718 (targetsize << 2);
7719 extrachars += needed;
7720 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007721 if (PyUnicode_Resize(&v,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007722 PyUnicode_GET_LENGTH(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007723 Py_DECREF(x);
7724 goto onError;
7725 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007727 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7728 goto onError;
7729 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7730 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007731 extrachars -= targetsize;
7732 }
7733 /* 1-0 mapping: skip the character */
7734 }
7735 else {
7736 /* wrong return value */
7737 PyErr_SetString(PyExc_TypeError,
7738 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007739 Py_DECREF(x);
7740 goto onError;
7741 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007742 Py_DECREF(x);
7743 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007746 if (PyUnicode_Resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007747 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007748 Py_XDECREF(errorHandler);
7749 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007750 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007751
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007753 Py_XDECREF(errorHandler);
7754 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755 Py_XDECREF(v);
7756 return NULL;
7757}
7758
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007759/* Charmap encoding: the lookup table */
7760
Alexander Belopolsky40018472011-02-26 01:02:56 +00007761struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007762 PyObject_HEAD
7763 unsigned char level1[32];
7764 int count2, count3;
7765 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007766};
7767
7768static PyObject*
7769encoding_map_size(PyObject *obj, PyObject* args)
7770{
7771 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007772 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007774}
7775
7776static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007777 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007778 PyDoc_STR("Return the size (in bytes) of this object") },
7779 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007780};
7781
7782static void
7783encoding_map_dealloc(PyObject* o)
7784{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007785 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007786}
7787
7788static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007789 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007790 "EncodingMap", /*tp_name*/
7791 sizeof(struct encoding_map), /*tp_basicsize*/
7792 0, /*tp_itemsize*/
7793 /* methods */
7794 encoding_map_dealloc, /*tp_dealloc*/
7795 0, /*tp_print*/
7796 0, /*tp_getattr*/
7797 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007798 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007799 0, /*tp_repr*/
7800 0, /*tp_as_number*/
7801 0, /*tp_as_sequence*/
7802 0, /*tp_as_mapping*/
7803 0, /*tp_hash*/
7804 0, /*tp_call*/
7805 0, /*tp_str*/
7806 0, /*tp_getattro*/
7807 0, /*tp_setattro*/
7808 0, /*tp_as_buffer*/
7809 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7810 0, /*tp_doc*/
7811 0, /*tp_traverse*/
7812 0, /*tp_clear*/
7813 0, /*tp_richcompare*/
7814 0, /*tp_weaklistoffset*/
7815 0, /*tp_iter*/
7816 0, /*tp_iternext*/
7817 encoding_map_methods, /*tp_methods*/
7818 0, /*tp_members*/
7819 0, /*tp_getset*/
7820 0, /*tp_base*/
7821 0, /*tp_dict*/
7822 0, /*tp_descr_get*/
7823 0, /*tp_descr_set*/
7824 0, /*tp_dictoffset*/
7825 0, /*tp_init*/
7826 0, /*tp_alloc*/
7827 0, /*tp_new*/
7828 0, /*tp_free*/
7829 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007830};
7831
7832PyObject*
7833PyUnicode_BuildEncodingMap(PyObject* string)
7834{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007835 PyObject *result;
7836 struct encoding_map *mresult;
7837 int i;
7838 int need_dict = 0;
7839 unsigned char level1[32];
7840 unsigned char level2[512];
7841 unsigned char *mlevel1, *mlevel2, *mlevel3;
7842 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007843 int kind;
7844 void *data;
7845 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007846
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007847 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007848 PyErr_BadArgument();
7849 return NULL;
7850 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007851 kind = PyUnicode_KIND(string);
7852 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007853 memset(level1, 0xFF, sizeof level1);
7854 memset(level2, 0xFF, sizeof level2);
7855
7856 /* If there isn't a one-to-one mapping of NULL to \0,
7857 or if there are non-BMP characters, we need to use
7858 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007859 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007860 need_dict = 1;
7861 for (i = 1; i < 256; i++) {
7862 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007863 ch = PyUnicode_READ(kind, data, i);
7864 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007865 need_dict = 1;
7866 break;
7867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007868 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007869 /* unmapped character */
7870 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007871 l1 = ch >> 11;
7872 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007873 if (level1[l1] == 0xFF)
7874 level1[l1] = count2++;
7875 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007876 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007877 }
7878
7879 if (count2 >= 0xFF || count3 >= 0xFF)
7880 need_dict = 1;
7881
7882 if (need_dict) {
7883 PyObject *result = PyDict_New();
7884 PyObject *key, *value;
7885 if (!result)
7886 return NULL;
7887 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007888 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007889 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007890 if (!key || !value)
7891 goto failed1;
7892 if (PyDict_SetItem(result, key, value) == -1)
7893 goto failed1;
7894 Py_DECREF(key);
7895 Py_DECREF(value);
7896 }
7897 return result;
7898 failed1:
7899 Py_XDECREF(key);
7900 Py_XDECREF(value);
7901 Py_DECREF(result);
7902 return NULL;
7903 }
7904
7905 /* Create a three-level trie */
7906 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7907 16*count2 + 128*count3 - 1);
7908 if (!result)
7909 return PyErr_NoMemory();
7910 PyObject_Init(result, &EncodingMapType);
7911 mresult = (struct encoding_map*)result;
7912 mresult->count2 = count2;
7913 mresult->count3 = count3;
7914 mlevel1 = mresult->level1;
7915 mlevel2 = mresult->level23;
7916 mlevel3 = mresult->level23 + 16*count2;
7917 memcpy(mlevel1, level1, 32);
7918 memset(mlevel2, 0xFF, 16*count2);
7919 memset(mlevel3, 0, 128*count3);
7920 count3 = 0;
7921 for (i = 1; i < 256; i++) {
7922 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007923 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007924 /* unmapped character */
7925 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007926 o1 = PyUnicode_READ(kind, data, i)>>11;
7927 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007928 i2 = 16*mlevel1[o1] + o2;
7929 if (mlevel2[i2] == 0xFF)
7930 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007931 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007932 i3 = 128*mlevel2[i2] + o3;
7933 mlevel3[i3] = i;
7934 }
7935 return result;
7936}
7937
7938static int
Victor Stinner22168992011-11-20 17:09:18 +01007939encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007940{
7941 struct encoding_map *map = (struct encoding_map*)mapping;
7942 int l1 = c>>11;
7943 int l2 = (c>>7) & 0xF;
7944 int l3 = c & 0x7F;
7945 int i;
7946
Victor Stinner22168992011-11-20 17:09:18 +01007947 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007948 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007949 if (c == 0)
7950 return 0;
7951 /* level 1*/
7952 i = map->level1[l1];
7953 if (i == 0xFF) {
7954 return -1;
7955 }
7956 /* level 2*/
7957 i = map->level23[16*i+l2];
7958 if (i == 0xFF) {
7959 return -1;
7960 }
7961 /* level 3 */
7962 i = map->level23[16*map->count2 + 128*i + l3];
7963 if (i == 0) {
7964 return -1;
7965 }
7966 return i;
7967}
7968
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007969/* Lookup the character ch in the mapping. If the character
7970 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007971 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007972static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007973charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974{
Christian Heimes217cfd12007-12-02 14:31:20 +00007975 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007976 PyObject *x;
7977
7978 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007980 x = PyObject_GetItem(mapping, w);
7981 Py_DECREF(w);
7982 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7984 /* No mapping found means: mapping is undefined. */
7985 PyErr_Clear();
7986 x = Py_None;
7987 Py_INCREF(x);
7988 return x;
7989 } else
7990 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007992 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007994 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 long value = PyLong_AS_LONG(x);
7996 if (value < 0 || value > 255) {
7997 PyErr_SetString(PyExc_TypeError,
7998 "character mapping must be in range(256)");
7999 Py_DECREF(x);
8000 return NULL;
8001 }
8002 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008004 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008005 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008007 /* wrong return value */
8008 PyErr_Format(PyExc_TypeError,
8009 "character mapping must return integer, bytes or None, not %.400s",
8010 x->ob_type->tp_name);
8011 Py_DECREF(x);
8012 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013 }
8014}
8015
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008016static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008017charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008018{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008019 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8020 /* exponentially overallocate to minimize reallocations */
8021 if (requiredsize < 2*outsize)
8022 requiredsize = 2*outsize;
8023 if (_PyBytes_Resize(outobj, requiredsize))
8024 return -1;
8025 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008026}
8027
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008031/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008032 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008033 space is available. Return a new reference to the object that
8034 was put in the output buffer, or Py_None, if the mapping was undefined
8035 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008036 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008037static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008038charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008039 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008040{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008041 PyObject *rep;
8042 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008043 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008044
Christian Heimes90aa7642007-12-19 02:45:37 +00008045 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008046 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008048 if (res == -1)
8049 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008050 if (outsize<requiredsize)
8051 if (charmapencode_resize(outobj, outpos, requiredsize))
8052 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008053 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008054 outstart[(*outpos)++] = (char)res;
8055 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008056 }
8057
8058 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008059 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008061 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 Py_DECREF(rep);
8063 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008064 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008065 if (PyLong_Check(rep)) {
8066 Py_ssize_t requiredsize = *outpos+1;
8067 if (outsize<requiredsize)
8068 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8069 Py_DECREF(rep);
8070 return enc_EXCEPTION;
8071 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008072 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008073 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008074 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008075 else {
8076 const char *repchars = PyBytes_AS_STRING(rep);
8077 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8078 Py_ssize_t requiredsize = *outpos+repsize;
8079 if (outsize<requiredsize)
8080 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8081 Py_DECREF(rep);
8082 return enc_EXCEPTION;
8083 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008084 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008085 memcpy(outstart + *outpos, repchars, repsize);
8086 *outpos += repsize;
8087 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008088 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008089 Py_DECREF(rep);
8090 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008091}
8092
8093/* handle an error in PyUnicode_EncodeCharmap
8094 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008095static int
8096charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008097 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008098 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008099 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008100 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008101{
8102 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008103 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008104 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008105 enum PyUnicode_Kind kind;
8106 void *data;
8107 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008108 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008109 Py_ssize_t collstartpos = *inpos;
8110 Py_ssize_t collendpos = *inpos+1;
8111 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008112 char *encoding = "charmap";
8113 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008114 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008115 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008116 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008117
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008118 if (PyUnicode_READY(unicode) < 0)
8119 return -1;
8120 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008121 /* find all unencodable characters */
8122 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008123 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008124 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008125 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008126 val = encoding_map_lookup(ch, mapping);
8127 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008128 break;
8129 ++collendpos;
8130 continue;
8131 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008132
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008133 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8134 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 if (rep==NULL)
8136 return -1;
8137 else if (rep!=Py_None) {
8138 Py_DECREF(rep);
8139 break;
8140 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008141 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008142 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008143 }
8144 /* cache callback name lookup
8145 * (if not done yet, i.e. it's the first error) */
8146 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 if ((errors==NULL) || (!strcmp(errors, "strict")))
8148 *known_errorHandler = 1;
8149 else if (!strcmp(errors, "replace"))
8150 *known_errorHandler = 2;
8151 else if (!strcmp(errors, "ignore"))
8152 *known_errorHandler = 3;
8153 else if (!strcmp(errors, "xmlcharrefreplace"))
8154 *known_errorHandler = 4;
8155 else
8156 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008157 }
8158 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008159 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008160 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008161 return -1;
8162 case 2: /* replace */
8163 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008164 x = charmapencode_output('?', mapping, res, respos);
8165 if (x==enc_EXCEPTION) {
8166 return -1;
8167 }
8168 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008169 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008170 return -1;
8171 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008172 }
8173 /* fall through */
8174 case 3: /* ignore */
8175 *inpos = collendpos;
8176 break;
8177 case 4: /* xmlcharrefreplace */
8178 /* generate replacement (temporarily (mis)uses p) */
8179 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008180 char buffer[2+29+1+1];
8181 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008182 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 for (cp = buffer; *cp; ++cp) {
8184 x = charmapencode_output(*cp, mapping, res, respos);
8185 if (x==enc_EXCEPTION)
8186 return -1;
8187 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008188 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 return -1;
8190 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008191 }
8192 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008193 *inpos = collendpos;
8194 break;
8195 default:
8196 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008197 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008198 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008199 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008200 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008201 if (PyBytes_Check(repunicode)) {
8202 /* Directly copy bytes result to output. */
8203 Py_ssize_t outsize = PyBytes_Size(*res);
8204 Py_ssize_t requiredsize;
8205 repsize = PyBytes_Size(repunicode);
8206 requiredsize = *respos + repsize;
8207 if (requiredsize > outsize)
8208 /* Make room for all additional bytes. */
8209 if (charmapencode_resize(res, respos, requiredsize)) {
8210 Py_DECREF(repunicode);
8211 return -1;
8212 }
8213 memcpy(PyBytes_AsString(*res) + *respos,
8214 PyBytes_AsString(repunicode), repsize);
8215 *respos += repsize;
8216 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008217 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008218 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008219 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008220 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008221 if (PyUnicode_READY(repunicode) < 0) {
8222 Py_DECREF(repunicode);
8223 return -1;
8224 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008225 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008226 data = PyUnicode_DATA(repunicode);
8227 kind = PyUnicode_KIND(repunicode);
8228 for (index = 0; index < repsize; index++) {
8229 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8230 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008232 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008233 return -1;
8234 }
8235 else if (x==enc_FAILED) {
8236 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008237 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 return -1;
8239 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008240 }
8241 *inpos = newpos;
8242 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008243 }
8244 return 0;
8245}
8246
Alexander Belopolsky40018472011-02-26 01:02:56 +00008247PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008248_PyUnicode_EncodeCharmap(PyObject *unicode,
8249 PyObject *mapping,
8250 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008252 /* output object */
8253 PyObject *res = NULL;
8254 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008255 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008256 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008257 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008258 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008259 PyObject *errorHandler = NULL;
8260 PyObject *exc = NULL;
8261 /* the following variable is used for caching string comparisons
8262 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8263 * 3=ignore, 4=xmlcharrefreplace */
8264 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008266 if (PyUnicode_READY(unicode) < 0)
8267 return NULL;
8268 size = PyUnicode_GET_LENGTH(unicode);
8269
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270 /* Default to Latin-1 */
8271 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008272 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008274 /* allocate enough for a simple encoding without
8275 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008276 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008277 if (res == NULL)
8278 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008279 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008282 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008283 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008285 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008286 if (x==enc_EXCEPTION) /* error */
8287 goto onError;
8288 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008289 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 &exc,
8291 &known_errorHandler, &errorHandler, errors,
8292 &res, &respos)) {
8293 goto onError;
8294 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008295 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 else
8297 /* done with this character => adjust input position */
8298 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008301 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008302 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008303 if (_PyBytes_Resize(&res, respos) < 0)
8304 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008305
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008306 Py_XDECREF(exc);
8307 Py_XDECREF(errorHandler);
8308 return res;
8309
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008311 Py_XDECREF(res);
8312 Py_XDECREF(exc);
8313 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314 return NULL;
8315}
8316
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008317/* Deprecated */
8318PyObject *
8319PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8320 Py_ssize_t size,
8321 PyObject *mapping,
8322 const char *errors)
8323{
8324 PyObject *result;
8325 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8326 if (unicode == NULL)
8327 return NULL;
8328 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8329 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008330 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008331}
8332
Alexander Belopolsky40018472011-02-26 01:02:56 +00008333PyObject *
8334PyUnicode_AsCharmapString(PyObject *unicode,
8335 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336{
8337 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 PyErr_BadArgument();
8339 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008341 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342}
8343
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008344/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008345static void
8346make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008347 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008348 Py_ssize_t startpos, Py_ssize_t endpos,
8349 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008350{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008351 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008352 *exceptionObject = _PyUnicodeTranslateError_Create(
8353 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354 }
8355 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8357 goto onError;
8358 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8359 goto onError;
8360 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8361 goto onError;
8362 return;
8363 onError:
8364 Py_DECREF(*exceptionObject);
8365 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366 }
8367}
8368
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008369/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008370static void
8371raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008372 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008373 Py_ssize_t startpos, Py_ssize_t endpos,
8374 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008375{
8376 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008377 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008378 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008380}
8381
8382/* error handling callback helper:
8383 build arguments, call the callback and check the arguments,
8384 put the result into newpos and return the replacement string, which
8385 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008386static PyObject *
8387unicode_translate_call_errorhandler(const char *errors,
8388 PyObject **errorHandler,
8389 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008390 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008391 Py_ssize_t startpos, Py_ssize_t endpos,
8392 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008393{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008394 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008395
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008396 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008397 PyObject *restuple;
8398 PyObject *resunicode;
8399
8400 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008402 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404 }
8405
8406 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008407 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008410
8411 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008415 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008416 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008417 Py_DECREF(restuple);
8418 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008419 }
8420 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 &resunicode, &i_newpos)) {
8422 Py_DECREF(restuple);
8423 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008424 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008425 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008426 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008427 else
8428 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008429 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8431 Py_DECREF(restuple);
8432 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008433 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434 Py_INCREF(resunicode);
8435 Py_DECREF(restuple);
8436 return resunicode;
8437}
8438
8439/* Lookup the character ch in the mapping and put the result in result,
8440 which must be decrefed by the caller.
8441 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008442static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008443charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008444{
Christian Heimes217cfd12007-12-02 14:31:20 +00008445 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008446 PyObject *x;
8447
8448 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008450 x = PyObject_GetItem(mapping, w);
8451 Py_DECREF(w);
8452 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8454 /* No mapping found means: use 1:1 mapping. */
8455 PyErr_Clear();
8456 *result = NULL;
8457 return 0;
8458 } else
8459 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008460 }
8461 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 *result = x;
8463 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008464 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008465 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 long value = PyLong_AS_LONG(x);
8467 long max = PyUnicode_GetMax();
8468 if (value < 0 || value > max) {
8469 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008470 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 Py_DECREF(x);
8472 return -1;
8473 }
8474 *result = x;
8475 return 0;
8476 }
8477 else if (PyUnicode_Check(x)) {
8478 *result = x;
8479 return 0;
8480 }
8481 else {
8482 /* wrong return value */
8483 PyErr_SetString(PyExc_TypeError,
8484 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008485 Py_DECREF(x);
8486 return -1;
8487 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008488}
8489/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008490 if not reallocate and adjust various state variables.
8491 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008492static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008493charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008495{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008497 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 /* exponentially overallocate to minimize reallocations */
8499 if (requiredsize < 2 * oldsize)
8500 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8502 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008504 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008505 }
8506 return 0;
8507}
8508/* lookup the character, put the result in the output string and adjust
8509 various state variables. Return a new reference to the object that
8510 was put in the output buffer in *result, or Py_None, if the mapping was
8511 undefined (in which case no character was written).
8512 The called must decref result.
8513 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008514static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008515charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8516 PyObject *mapping, Py_UCS4 **output,
8517 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008518 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008519{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008520 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8521 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008523 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008525 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008526 }
8527 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008529 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008532 }
8533 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008534 Py_ssize_t repsize;
8535 if (PyUnicode_READY(*res) == -1)
8536 return -1;
8537 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008538 if (repsize==1) {
8539 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 }
8542 else if (repsize!=0) {
8543 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008544 Py_ssize_t requiredsize = *opos +
8545 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008546 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008547 Py_ssize_t i;
8548 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008550 for(i = 0; i < repsize; i++)
8551 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008553 }
8554 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008556 return 0;
8557}
8558
Alexander Belopolsky40018472011-02-26 01:02:56 +00008559PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008560_PyUnicode_TranslateCharmap(PyObject *input,
8561 PyObject *mapping,
8562 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008564 /* input object */
8565 char *idata;
8566 Py_ssize_t size, i;
8567 int kind;
8568 /* output buffer */
8569 Py_UCS4 *output = NULL;
8570 Py_ssize_t osize;
8571 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008572 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008574 char *reason = "character maps to <undefined>";
8575 PyObject *errorHandler = NULL;
8576 PyObject *exc = NULL;
8577 /* the following variable is used for caching string comparisons
8578 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8579 * 3=ignore, 4=xmlcharrefreplace */
8580 int known_errorHandler = -1;
8581
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 PyErr_BadArgument();
8584 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008587 if (PyUnicode_READY(input) == -1)
8588 return NULL;
8589 idata = (char*)PyUnicode_DATA(input);
8590 kind = PyUnicode_KIND(input);
8591 size = PyUnicode_GET_LENGTH(input);
8592 i = 0;
8593
8594 if (size == 0) {
8595 Py_INCREF(input);
8596 return input;
8597 }
8598
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008599 /* allocate enough for a simple 1:1 translation without
8600 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008601 osize = size;
8602 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8603 opos = 0;
8604 if (output == NULL) {
8605 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008606 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008609 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 /* try to encode it */
8611 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008612 if (charmaptranslate_output(input, i, mapping,
8613 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 Py_XDECREF(x);
8615 goto onError;
8616 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008617 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 else { /* untranslatable character */
8621 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8622 Py_ssize_t repsize;
8623 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008624 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 Py_ssize_t collstart = i;
8627 Py_ssize_t collend = i+1;
8628 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629
Benjamin Peterson29060642009-01-31 22:14:21 +00008630 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631 while (collend < size) {
8632 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008633 goto onError;
8634 Py_XDECREF(x);
8635 if (x!=Py_None)
8636 break;
8637 ++collend;
8638 }
8639 /* cache callback name lookup
8640 * (if not done yet, i.e. it's the first error) */
8641 if (known_errorHandler==-1) {
8642 if ((errors==NULL) || (!strcmp(errors, "strict")))
8643 known_errorHandler = 1;
8644 else if (!strcmp(errors, "replace"))
8645 known_errorHandler = 2;
8646 else if (!strcmp(errors, "ignore"))
8647 known_errorHandler = 3;
8648 else if (!strcmp(errors, "xmlcharrefreplace"))
8649 known_errorHandler = 4;
8650 else
8651 known_errorHandler = 0;
8652 }
8653 switch (known_errorHandler) {
8654 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008655 raise_translate_exception(&exc, input, collstart,
8656 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008657 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008658 case 2: /* replace */
8659 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660 for (coll = collstart; coll<collend; coll++)
8661 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 /* fall through */
8663 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 break;
8666 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008667 /* generate replacement (temporarily (mis)uses i) */
8668 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 char buffer[2+29+1+1];
8670 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8672 if (charmaptranslate_makespace(&output, &osize,
8673 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 goto onError;
8675 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008678 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 break;
8680 default:
8681 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008682 reason, input, &exc,
8683 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008684 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008686 if (PyUnicode_READY(repunicode) < 0) {
8687 Py_DECREF(repunicode);
8688 goto onError;
8689 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008691 repsize = PyUnicode_GET_LENGTH(repunicode);
8692 if (charmaptranslate_makespace(&output, &osize,
8693 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 Py_DECREF(repunicode);
8695 goto onError;
8696 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697 for (uni2 = 0; repsize-->0; ++uni2)
8698 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8699 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008700 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008701 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008702 }
8703 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008704 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8705 if (!res)
8706 goto onError;
8707 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008708 Py_XDECREF(exc);
8709 Py_XDECREF(errorHandler);
8710 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008713 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008714 Py_XDECREF(exc);
8715 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716 return NULL;
8717}
8718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008719/* Deprecated. Use PyUnicode_Translate instead. */
8720PyObject *
8721PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8722 Py_ssize_t size,
8723 PyObject *mapping,
8724 const char *errors)
8725{
8726 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8727 if (!unicode)
8728 return NULL;
8729 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8730}
8731
Alexander Belopolsky40018472011-02-26 01:02:56 +00008732PyObject *
8733PyUnicode_Translate(PyObject *str,
8734 PyObject *mapping,
8735 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008736{
8737 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008738
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739 str = PyUnicode_FromObject(str);
8740 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008741 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008742 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008743 Py_DECREF(str);
8744 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008745
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747 Py_XDECREF(str);
8748 return NULL;
8749}
Tim Petersced69f82003-09-16 20:30:58 +00008750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008751static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008752fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008753{
8754 /* No need to call PyUnicode_READY(self) because this function is only
8755 called as a callback from fixup() which does it already. */
8756 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8757 const int kind = PyUnicode_KIND(self);
8758 void *data = PyUnicode_DATA(self);
8759 Py_UCS4 maxchar = 0, ch, fixed;
8760 Py_ssize_t i;
8761
8762 for (i = 0; i < len; ++i) {
8763 ch = PyUnicode_READ(kind, data, i);
8764 fixed = 0;
8765 if (ch > 127) {
8766 if (Py_UNICODE_ISSPACE(ch))
8767 fixed = ' ';
8768 else {
8769 const int decimal = Py_UNICODE_TODECIMAL(ch);
8770 if (decimal >= 0)
8771 fixed = '0' + decimal;
8772 }
8773 if (fixed != 0) {
8774 if (fixed > maxchar)
8775 maxchar = fixed;
8776 PyUnicode_WRITE(kind, data, i, fixed);
8777 }
8778 else if (ch > maxchar)
8779 maxchar = ch;
8780 }
8781 else if (ch > maxchar)
8782 maxchar = ch;
8783 }
8784
8785 return maxchar;
8786}
8787
8788PyObject *
8789_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8790{
8791 if (!PyUnicode_Check(unicode)) {
8792 PyErr_BadInternalCall();
8793 return NULL;
8794 }
8795 if (PyUnicode_READY(unicode) == -1)
8796 return NULL;
8797 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8798 /* If the string is already ASCII, just return the same string */
8799 Py_INCREF(unicode);
8800 return unicode;
8801 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008802 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803}
8804
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008805PyObject *
8806PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8807 Py_ssize_t length)
8808{
Victor Stinnerf0124502011-11-21 23:12:56 +01008809 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008810 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008811 Py_UCS4 maxchar;
8812 enum PyUnicode_Kind kind;
8813 void *data;
8814
8815 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008816 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008817 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008818 if (ch > 127) {
8819 int decimal = Py_UNICODE_TODECIMAL(ch);
8820 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008821 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008822 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008823 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008824 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008825
8826 /* Copy to a new string */
8827 decimal = PyUnicode_New(length, maxchar);
8828 if (decimal == NULL)
8829 return decimal;
8830 kind = PyUnicode_KIND(decimal);
8831 data = PyUnicode_DATA(decimal);
8832 /* Iterate over code points */
8833 for (i = 0; i < length; i++) {
8834 Py_UNICODE ch = s[i];
8835 if (ch > 127) {
8836 int decimal = Py_UNICODE_TODECIMAL(ch);
8837 if (decimal >= 0)
8838 ch = '0' + decimal;
8839 }
8840 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008841 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008842 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008843}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008844/* --- Decimal Encoder ---------------------------------------------------- */
8845
Alexander Belopolsky40018472011-02-26 01:02:56 +00008846int
8847PyUnicode_EncodeDecimal(Py_UNICODE *s,
8848 Py_ssize_t length,
8849 char *output,
8850 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008851{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008852 PyObject *errorHandler = NULL;
8853 PyObject *exc = NULL;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008854 PyObject *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008855 const char *encoding = "decimal";
8856 const char *reason = "invalid decimal Unicode string";
8857 /* the following variable is used for caching string comparisons
8858 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8859 int known_errorHandler = -1;
Victor Stinner42bf7752011-11-21 22:52:58 +01008860 Py_ssize_t i, j;
8861 enum PyUnicode_Kind kind;
8862 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008863
8864 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008865 PyErr_BadArgument();
8866 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008867 }
8868
Victor Stinner42bf7752011-11-21 22:52:58 +01008869 unicode = PyUnicode_FromUnicode(s, length);
8870 if (unicode == NULL)
8871 return -1;
8872
8873 if (PyUnicode_READY(unicode) < 0)
8874 goto onError;
8875 kind = PyUnicode_KIND(unicode);
8876 data = PyUnicode_DATA(unicode);
8877
Victor Stinnerb84d7232011-11-22 01:50:07 +01008878 for (i=0; i < length; ) {
Victor Stinner42bf7752011-11-21 22:52:58 +01008879 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008880 int decimal;
Victor Stinner42bf7752011-11-21 22:52:58 +01008881 Py_ssize_t startpos, endpos;
Tim Petersced69f82003-09-16 20:30:58 +00008882
Benjamin Peterson29060642009-01-31 22:14:21 +00008883 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008884 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008885 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008886 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008887 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 decimal = Py_UNICODE_TODECIMAL(ch);
8889 if (decimal >= 0) {
8890 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008891 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008892 continue;
8893 }
8894 if (0 < ch && ch < 256) {
8895 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008896 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 continue;
8898 }
8899 /* All other characters are considered unencodable */
Victor Stinner42bf7752011-11-21 22:52:58 +01008900 startpos = i;
8901 endpos = i+1;
8902 for (; endpos < length; endpos++) {
8903 ch = PyUnicode_READ(kind, data, endpos);
8904 if ((0 < ch && ch < 256) ||
Victor Stinnerb84d7232011-11-22 01:50:07 +01008905 Py_UNICODE_ISSPACE(ch) ||
8906 0 <= Py_UNICODE_TODECIMAL(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00008907 break;
8908 }
8909 /* cache callback name lookup
8910 * (if not done yet, i.e. it's the first error) */
8911 if (known_errorHandler==-1) {
8912 if ((errors==NULL) || (!strcmp(errors, "strict")))
8913 known_errorHandler = 1;
8914 else if (!strcmp(errors, "replace"))
8915 known_errorHandler = 2;
8916 else if (!strcmp(errors, "ignore"))
8917 known_errorHandler = 3;
8918 else if (!strcmp(errors, "xmlcharrefreplace"))
8919 known_errorHandler = 4;
8920 else
8921 known_errorHandler = 0;
8922 }
8923 switch (known_errorHandler) {
8924 case 1: /* strict */
Victor Stinner42bf7752011-11-21 22:52:58 +01008925 raise_encode_exception(&exc, encoding, unicode, startpos, endpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008926 goto onError;
8927 case 2: /* replace */
Victor Stinner42bf7752011-11-21 22:52:58 +01008928 for (j=startpos; j < endpos; j++)
Benjamin Peterson29060642009-01-31 22:14:21 +00008929 *output++ = '?';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008930 i = endpos;
8931 break;
Benjamin Peterson29060642009-01-31 22:14:21 +00008932 case 3: /* ignore */
Victor Stinner42bf7752011-11-21 22:52:58 +01008933 i = endpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 break;
8935 case 4: /* xmlcharrefreplace */
Victor Stinner42bf7752011-11-21 22:52:58 +01008936 /* generate replacement */
8937 for (j=startpos; j < endpos; j++) {
8938 ch = PyUnicode_READ(kind, data, i);
8939 output += sprintf(output, "&#%d;", (int)ch);
8940 i++;
8941 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008942 break;
8943 default:
Victor Stinner42bf7752011-11-21 22:52:58 +01008944 {
8945 PyObject *repunicode;
8946 Py_ssize_t repsize, newpos, k;
8947 enum PyUnicode_Kind repkind;
8948 void *repdata;
8949
Benjamin Peterson29060642009-01-31 22:14:21 +00008950 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008951 encoding, reason, unicode, &exc,
Victor Stinner42bf7752011-11-21 22:52:58 +01008952 startpos, endpos, &newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008953 if (repunicode == NULL)
8954 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008955 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008956 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008957 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8958 Py_DECREF(repunicode);
8959 goto onError;
8960 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008961 if (PyUnicode_READY(repunicode) < 0) {
8962 Py_DECREF(repunicode);
8963 goto onError;
8964 }
8965 repkind = PyUnicode_KIND(repunicode);
8966 repdata = PyUnicode_DATA(repunicode);
8967
Benjamin Peterson29060642009-01-31 22:14:21 +00008968 /* generate replacement */
8969 repsize = PyUnicode_GET_SIZE(repunicode);
Victor Stinner42bf7752011-11-21 22:52:58 +01008970 for (k=0; k<repsize; k++) {
8971 ch = PyUnicode_READ(repkind, repdata, k);
Benjamin Peterson29060642009-01-31 22:14:21 +00008972 if (Py_UNICODE_ISSPACE(ch))
8973 *output++ = ' ';
8974 else {
8975 decimal = Py_UNICODE_TODECIMAL(ch);
8976 if (decimal >= 0)
8977 *output++ = '0' + decimal;
8978 else if (0 < ch && ch < 256)
8979 *output++ = (char)ch;
8980 else {
8981 Py_DECREF(repunicode);
8982 raise_encode_exception(&exc, encoding,
Victor Stinner42bf7752011-11-21 22:52:58 +01008983 unicode, startpos, endpos,
8984 reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008985 goto onError;
8986 }
8987 }
8988 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008989 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008990 Py_DECREF(repunicode);
8991 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008992 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008993 }
8994 /* 0-terminate the output string */
8995 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008996 Py_XDECREF(exc);
8997 Py_XDECREF(errorHandler);
Victor Stinner42bf7752011-11-21 22:52:58 +01008998 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008999 return 0;
9000
Benjamin Peterson29060642009-01-31 22:14:21 +00009001 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009002 Py_XDECREF(exc);
9003 Py_XDECREF(errorHandler);
Victor Stinner42bf7752011-11-21 22:52:58 +01009004 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009005 return -1;
9006}
9007
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008/* --- Helpers ------------------------------------------------------------ */
9009
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009010static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009011any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009012 Py_ssize_t start,
9013 Py_ssize_t end)
9014{
9015 int kind1, kind2, kind;
9016 void *buf1, *buf2;
9017 Py_ssize_t len1, len2, result;
9018
9019 kind1 = PyUnicode_KIND(s1);
9020 kind2 = PyUnicode_KIND(s2);
9021 kind = kind1 > kind2 ? kind1 : kind2;
9022 buf1 = PyUnicode_DATA(s1);
9023 buf2 = PyUnicode_DATA(s2);
9024 if (kind1 != kind)
9025 buf1 = _PyUnicode_AsKind(s1, kind);
9026 if (!buf1)
9027 return -2;
9028 if (kind2 != kind)
9029 buf2 = _PyUnicode_AsKind(s2, kind);
9030 if (!buf2) {
9031 if (kind1 != kind) PyMem_Free(buf1);
9032 return -2;
9033 }
9034 len1 = PyUnicode_GET_LENGTH(s1);
9035 len2 = PyUnicode_GET_LENGTH(s2);
9036
Victor Stinner794d5672011-10-10 03:21:36 +02009037 if (direction > 0) {
9038 switch(kind) {
9039 case PyUnicode_1BYTE_KIND:
9040 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9041 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9042 else
9043 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9044 break;
9045 case PyUnicode_2BYTE_KIND:
9046 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9047 break;
9048 case PyUnicode_4BYTE_KIND:
9049 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9050 break;
9051 default:
9052 assert(0); result = -2;
9053 }
9054 }
9055 else {
9056 switch(kind) {
9057 case PyUnicode_1BYTE_KIND:
9058 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9059 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9060 else
9061 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9062 break;
9063 case PyUnicode_2BYTE_KIND:
9064 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9065 break;
9066 case PyUnicode_4BYTE_KIND:
9067 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9068 break;
9069 default:
9070 assert(0); result = -2;
9071 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009072 }
9073
9074 if (kind1 != kind)
9075 PyMem_Free(buf1);
9076 if (kind2 != kind)
9077 PyMem_Free(buf2);
9078
9079 return result;
9080}
9081
9082Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009083_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009084 Py_ssize_t n_buffer,
9085 void *digits, Py_ssize_t n_digits,
9086 Py_ssize_t min_width,
9087 const char *grouping,
9088 const char *thousands_sep)
9089{
9090 switch(kind) {
9091 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009092 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9093 return _PyUnicode_ascii_InsertThousandsGrouping(
9094 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9095 min_width, grouping, thousands_sep);
9096 else
9097 return _PyUnicode_ucs1_InsertThousandsGrouping(
9098 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9099 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100 case PyUnicode_2BYTE_KIND:
9101 return _PyUnicode_ucs2_InsertThousandsGrouping(
9102 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9103 min_width, grouping, thousands_sep);
9104 case PyUnicode_4BYTE_KIND:
9105 return _PyUnicode_ucs4_InsertThousandsGrouping(
9106 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9107 min_width, grouping, thousands_sep);
9108 }
9109 assert(0);
9110 return -1;
9111}
9112
9113
/* helper macro to fixup start/end slice values.

   `end` is clipped to `len`; a negative `end` or `start` is taken relative
   to `len` and then clipped to 0.  `start` and `end` must be modifiable
   lvalues and are updated in place; `len` may be evaluated several times.

   The body is wrapped in do { } while (0) so that the macro behaves as one
   statement (it is safe as the unbraced body of an if/else); invoke it with
   a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009128
Alexander Belopolsky40018472011-02-26 01:02:56 +00009129Py_ssize_t
9130PyUnicode_Count(PyObject *str,
9131 PyObject *substr,
9132 Py_ssize_t start,
9133 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009135 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009136 PyObject* str_obj;
9137 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009138 int kind1, kind2, kind;
9139 void *buf1 = NULL, *buf2 = NULL;
9140 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009141
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009142 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009143 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009144 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009145 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009146 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009147 Py_DECREF(str_obj);
9148 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009149 }
Tim Petersced69f82003-09-16 20:30:58 +00009150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009151 kind1 = PyUnicode_KIND(str_obj);
9152 kind2 = PyUnicode_KIND(sub_obj);
9153 kind = kind1 > kind2 ? kind1 : kind2;
9154 buf1 = PyUnicode_DATA(str_obj);
9155 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009156 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009157 if (!buf1)
9158 goto onError;
9159 buf2 = PyUnicode_DATA(sub_obj);
9160 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009161 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009162 if (!buf2)
9163 goto onError;
9164 len1 = PyUnicode_GET_LENGTH(str_obj);
9165 len2 = PyUnicode_GET_LENGTH(sub_obj);
9166
9167 ADJUST_INDICES(start, end, len1);
9168 switch(kind) {
9169 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009170 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9171 result = asciilib_count(
9172 ((Py_UCS1*)buf1) + start, end - start,
9173 buf2, len2, PY_SSIZE_T_MAX
9174 );
9175 else
9176 result = ucs1lib_count(
9177 ((Py_UCS1*)buf1) + start, end - start,
9178 buf2, len2, PY_SSIZE_T_MAX
9179 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009180 break;
9181 case PyUnicode_2BYTE_KIND:
9182 result = ucs2lib_count(
9183 ((Py_UCS2*)buf1) + start, end - start,
9184 buf2, len2, PY_SSIZE_T_MAX
9185 );
9186 break;
9187 case PyUnicode_4BYTE_KIND:
9188 result = ucs4lib_count(
9189 ((Py_UCS4*)buf1) + start, end - start,
9190 buf2, len2, PY_SSIZE_T_MAX
9191 );
9192 break;
9193 default:
9194 assert(0); result = 0;
9195 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009196
9197 Py_DECREF(sub_obj);
9198 Py_DECREF(str_obj);
9199
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009200 if (kind1 != kind)
9201 PyMem_Free(buf1);
9202 if (kind2 != kind)
9203 PyMem_Free(buf2);
9204
Guido van Rossumd57fd912000-03-10 22:53:23 +00009205 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009206 onError:
9207 Py_DECREF(sub_obj);
9208 Py_DECREF(str_obj);
9209 if (kind1 != kind && buf1)
9210 PyMem_Free(buf1);
9211 if (kind2 != kind && buf2)
9212 PyMem_Free(buf2);
9213 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009214}
9215
Alexander Belopolsky40018472011-02-26 01:02:56 +00009216Py_ssize_t
9217PyUnicode_Find(PyObject *str,
9218 PyObject *sub,
9219 Py_ssize_t start,
9220 Py_ssize_t end,
9221 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009222{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009223 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009224
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009226 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009227 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009228 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009229 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009230 Py_DECREF(str);
9231 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009232 }
Tim Petersced69f82003-09-16 20:30:58 +00009233
Victor Stinner794d5672011-10-10 03:21:36 +02009234 result = any_find_slice(direction,
9235 str, sub, start, end
9236 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009237
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009239 Py_DECREF(sub);
9240
Guido van Rossumd57fd912000-03-10 22:53:23 +00009241 return result;
9242}
9243
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244Py_ssize_t
9245PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9246 Py_ssize_t start, Py_ssize_t end,
9247 int direction)
9248{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009249 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009250 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009251 if (PyUnicode_READY(str) == -1)
9252 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009253 if (start < 0 || end < 0) {
9254 PyErr_SetString(PyExc_IndexError, "string index out of range");
9255 return -2;
9256 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009257 if (end > PyUnicode_GET_LENGTH(str))
9258 end = PyUnicode_GET_LENGTH(str);
9259 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009260 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9261 kind, end-start, ch, direction);
9262 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009264 else
9265 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266}
9267
Alexander Belopolsky40018472011-02-26 01:02:56 +00009268static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009269tailmatch(PyObject *self,
9270 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009271 Py_ssize_t start,
9272 Py_ssize_t end,
9273 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009274{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 int kind_self;
9276 int kind_sub;
9277 void *data_self;
9278 void *data_sub;
9279 Py_ssize_t offset;
9280 Py_ssize_t i;
9281 Py_ssize_t end_sub;
9282
9283 if (PyUnicode_READY(self) == -1 ||
9284 PyUnicode_READY(substring) == -1)
9285 return 0;
9286
9287 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009288 return 1;
9289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009290 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9291 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009292 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009293 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 kind_self = PyUnicode_KIND(self);
9296 data_self = PyUnicode_DATA(self);
9297 kind_sub = PyUnicode_KIND(substring);
9298 data_sub = PyUnicode_DATA(substring);
9299 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9300
9301 if (direction > 0)
9302 offset = end;
9303 else
9304 offset = start;
9305
9306 if (PyUnicode_READ(kind_self, data_self, offset) ==
9307 PyUnicode_READ(kind_sub, data_sub, 0) &&
9308 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9309 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9310 /* If both are of the same kind, memcmp is sufficient */
9311 if (kind_self == kind_sub) {
9312 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009313 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009314 data_sub,
9315 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009316 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317 }
9318 /* otherwise we have to compare each character by first accesing it */
9319 else {
9320 /* We do not need to compare 0 and len(substring)-1 because
9321 the if statement above ensured already that they are equal
9322 when we end up here. */
9323 // TODO: honor direction and do a forward or backwards search
9324 for (i = 1; i < end_sub; ++i) {
9325 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9326 PyUnicode_READ(kind_sub, data_sub, i))
9327 return 0;
9328 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009329 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009330 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009331 }
9332
9333 return 0;
9334}
9335
Alexander Belopolsky40018472011-02-26 01:02:56 +00009336Py_ssize_t
9337PyUnicode_Tailmatch(PyObject *str,
9338 PyObject *substr,
9339 Py_ssize_t start,
9340 Py_ssize_t end,
9341 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009342{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009343 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009344
Guido van Rossumd57fd912000-03-10 22:53:23 +00009345 str = PyUnicode_FromObject(str);
9346 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009347 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009348 substr = PyUnicode_FromObject(substr);
9349 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009350 Py_DECREF(str);
9351 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009352 }
Tim Petersced69f82003-09-16 20:30:58 +00009353
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009354 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009355 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009356 Py_DECREF(str);
9357 Py_DECREF(substr);
9358 return result;
9359}
9360
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361/* Apply fixfct filter to the Unicode object self and return a
9362 reference to the modified object */
9363
Alexander Belopolsky40018472011-02-26 01:02:56 +00009364static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009365fixup(PyObject *self,
9366 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009367{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009368 PyObject *u;
9369 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009370
Victor Stinner87af4f22011-11-21 23:03:47 +01009371 u = PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009372 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009373 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009374 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009376 /* fix functions return the new maximum character in a string,
9377 if the kind of the resulting unicode object does not change,
9378 everything is fine. Otherwise we need to change the string kind
9379 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009380 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009381 if (maxchar_new == 0)
9382 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9383 else if (maxchar_new <= 127)
9384 maxchar_new = 127;
9385 else if (maxchar_new <= 255)
9386 maxchar_new = 255;
9387 else if (maxchar_new <= 65535)
9388 maxchar_new = 65535;
9389 else
9390 maxchar_new = 1114111; /* 0x10ffff */
9391
9392 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009393 /* fixfct should return TRUE if it modified the buffer. If
9394 FALSE, return a reference to the original buffer instead
9395 (to save space, not time) */
9396 Py_INCREF(self);
9397 Py_DECREF(u);
Victor Stinner7931d9a2011-11-04 00:22:48 +01009398 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009400 else if (maxchar_new == maxchar_old) {
9401 return u;
9402 }
9403 else {
9404 /* In case the maximum character changed, we need to
9405 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009406 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009407 if (v == NULL) {
9408 Py_DECREF(u);
9409 return NULL;
9410 }
9411 if (maxchar_new > maxchar_old) {
9412 /* If the maxchar increased so that the kind changed, not all
9413 characters are representable anymore and we need to fix the
9414 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009415 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009416 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009417 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9418 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009419 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009420 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009421 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009422
9423 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009424 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 return v;
9426 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009427}
9428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009429static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009430fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009431{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009432 /* No need to call PyUnicode_READY(self) because this function is only
9433 called as a callback from fixup() which does it already. */
9434 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9435 const int kind = PyUnicode_KIND(self);
9436 void *data = PyUnicode_DATA(self);
9437 int touched = 0;
9438 Py_UCS4 maxchar = 0;
9439 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009440
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009441 for (i = 0; i < len; ++i) {
9442 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9443 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9444 if (up != ch) {
9445 if (up > maxchar)
9446 maxchar = up;
9447 PyUnicode_WRITE(kind, data, i, up);
9448 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009449 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 else if (ch > maxchar)
9451 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009452 }
9453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 if (touched)
9455 return maxchar;
9456 else
9457 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009458}
9459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009461fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009462{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9464 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9465 const int kind = PyUnicode_KIND(self);
9466 void *data = PyUnicode_DATA(self);
9467 int touched = 0;
9468 Py_UCS4 maxchar = 0;
9469 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 for(i = 0; i < len; ++i) {
9472 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9473 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9474 if (lo != ch) {
9475 if (lo > maxchar)
9476 maxchar = lo;
9477 PyUnicode_WRITE(kind, data, i, lo);
9478 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009479 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009480 else if (ch > maxchar)
9481 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482 }
9483
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484 if (touched)
9485 return maxchar;
9486 else
9487 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488}
9489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009491fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009493 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9494 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9495 const int kind = PyUnicode_KIND(self);
9496 void *data = PyUnicode_DATA(self);
9497 int touched = 0;
9498 Py_UCS4 maxchar = 0;
9499 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 for(i = 0; i < len; ++i) {
9502 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9503 Py_UCS4 nu = 0;
9504
9505 if (Py_UNICODE_ISUPPER(ch))
9506 nu = Py_UNICODE_TOLOWER(ch);
9507 else if (Py_UNICODE_ISLOWER(ch))
9508 nu = Py_UNICODE_TOUPPER(ch);
9509
9510 if (nu != 0) {
9511 if (nu > maxchar)
9512 maxchar = nu;
9513 PyUnicode_WRITE(kind, data, i, nu);
9514 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 else if (ch > maxchar)
9517 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518 }
9519
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009520 if (touched)
9521 return maxchar;
9522 else
9523 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009524}
9525
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009526static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009527fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009528{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009529 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9530 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9531 const int kind = PyUnicode_KIND(self);
9532 void *data = PyUnicode_DATA(self);
9533 int touched = 0;
9534 Py_UCS4 maxchar = 0;
9535 Py_ssize_t i = 0;
9536 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009537
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009538 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009539 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540
9541 ch = PyUnicode_READ(kind, data, i);
9542 if (!Py_UNICODE_ISUPPER(ch)) {
9543 maxchar = Py_UNICODE_TOUPPER(ch);
9544 PyUnicode_WRITE(kind, data, i, maxchar);
9545 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009546 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009547 ++i;
9548 for(; i < len; ++i) {
9549 ch = PyUnicode_READ(kind, data, i);
9550 if (!Py_UNICODE_ISLOWER(ch)) {
9551 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9552 if (lo > maxchar)
9553 maxchar = lo;
9554 PyUnicode_WRITE(kind, data, i, lo);
9555 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009556 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009557 else if (ch > maxchar)
9558 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009559 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560
9561 if (touched)
9562 return maxchar;
9563 else
9564 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009565}
9566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009567static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009568fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009570 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9571 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9572 const int kind = PyUnicode_KIND(self);
9573 void *data = PyUnicode_DATA(self);
9574 Py_UCS4 maxchar = 0;
9575 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009576 int previous_is_cased;
9577
9578 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009579 if (len == 1) {
9580 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9581 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9582 if (ti != ch) {
9583 PyUnicode_WRITE(kind, data, i, ti);
9584 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009585 }
9586 else
9587 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009588 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590 for(; i < len; ++i) {
9591 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9592 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009593
Benjamin Peterson29060642009-01-31 22:14:21 +00009594 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009596 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009597 nu = Py_UNICODE_TOTITLE(ch);
9598
9599 if (nu > maxchar)
9600 maxchar = nu;
9601 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009602
Benjamin Peterson29060642009-01-31 22:14:21 +00009603 if (Py_UNICODE_ISLOWER(ch) ||
9604 Py_UNICODE_ISUPPER(ch) ||
9605 Py_UNICODE_ISTITLE(ch))
9606 previous_is_cased = 1;
9607 else
9608 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009609 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009611}
9612
Tim Peters8ce9f162004-08-27 01:49:32 +00009613PyObject *
9614PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009615{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009616 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009617 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009618 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009619 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009620 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9621 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009622 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009623 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009624 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009626 int use_memcpy;
9627 unsigned char *res_data = NULL, *sep_data = NULL;
9628 PyObject *last_obj;
9629 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630
Tim Peters05eba1f2004-08-27 21:32:02 +00009631 fseq = PySequence_Fast(seq, "");
9632 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009633 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009634 }
9635
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009636 /* NOTE: the following code can't call back into Python code,
9637 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009638 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009639
Tim Peters05eba1f2004-08-27 21:32:02 +00009640 seqlen = PySequence_Fast_GET_SIZE(fseq);
9641 /* If empty sequence, return u"". */
9642 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009643 Py_DECREF(fseq);
9644 Py_INCREF(unicode_empty);
9645 res = unicode_empty;
9646 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009647 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009648
Tim Peters05eba1f2004-08-27 21:32:02 +00009649 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009650 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009651 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009652 if (seqlen == 1) {
9653 if (PyUnicode_CheckExact(items[0])) {
9654 res = items[0];
9655 Py_INCREF(res);
9656 Py_DECREF(fseq);
9657 return res;
9658 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009659 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009660 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009661 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009662 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009663 /* Set up sep and seplen */
9664 if (separator == NULL) {
9665 /* fall back to a blank space separator */
9666 sep = PyUnicode_FromOrdinal(' ');
9667 if (!sep)
9668 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009669 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009670 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009671 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009672 else {
9673 if (!PyUnicode_Check(separator)) {
9674 PyErr_Format(PyExc_TypeError,
9675 "separator: expected str instance,"
9676 " %.80s found",
9677 Py_TYPE(separator)->tp_name);
9678 goto onError;
9679 }
9680 if (PyUnicode_READY(separator))
9681 goto onError;
9682 sep = separator;
9683 seplen = PyUnicode_GET_LENGTH(separator);
9684 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9685 /* inc refcount to keep this code path symmetric with the
9686 above case of a blank separator */
9687 Py_INCREF(sep);
9688 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009689 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009690 }
9691
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009692 /* There are at least two things to join, or else we have a subclass
9693 * of str in the sequence.
9694 * Do a pre-pass to figure out the total amount of space we'll
9695 * need (sz), and see whether all argument are strings.
9696 */
9697 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009698#ifdef Py_DEBUG
9699 use_memcpy = 0;
9700#else
9701 use_memcpy = 1;
9702#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009703 for (i = 0; i < seqlen; i++) {
9704 const Py_ssize_t old_sz = sz;
9705 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009706 if (!PyUnicode_Check(item)) {
9707 PyErr_Format(PyExc_TypeError,
9708 "sequence item %zd: expected str instance,"
9709 " %.80s found",
9710 i, Py_TYPE(item)->tp_name);
9711 goto onError;
9712 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713 if (PyUnicode_READY(item) == -1)
9714 goto onError;
9715 sz += PyUnicode_GET_LENGTH(item);
9716 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009717 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009718 if (i != 0)
9719 sz += seplen;
9720 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9721 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009722 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009723 goto onError;
9724 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009725 if (use_memcpy && last_obj != NULL) {
9726 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9727 use_memcpy = 0;
9728 }
9729 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009730 }
Tim Petersced69f82003-09-16 20:30:58 +00009731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009732 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009733 if (res == NULL)
9734 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009735
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009736 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009737#ifdef Py_DEBUG
9738 use_memcpy = 0;
9739#else
9740 if (use_memcpy) {
9741 res_data = PyUnicode_1BYTE_DATA(res);
9742 kind = PyUnicode_KIND(res);
9743 if (seplen != 0)
9744 sep_data = PyUnicode_1BYTE_DATA(sep);
9745 }
9746#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009747 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009748 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009749 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009750 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009751 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009752 if (use_memcpy) {
9753 Py_MEMCPY(res_data,
9754 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009755 kind * seplen);
9756 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009757 }
9758 else {
9759 copy_characters(res, res_offset, sep, 0, seplen);
9760 res_offset += seplen;
9761 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009762 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009763 itemlen = PyUnicode_GET_LENGTH(item);
9764 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009765 if (use_memcpy) {
9766 Py_MEMCPY(res_data,
9767 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009768 kind * itemlen);
9769 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009770 }
9771 else {
9772 copy_characters(res, res_offset, item, 0, itemlen);
9773 res_offset += itemlen;
9774 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009775 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009776 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009777 if (use_memcpy)
9778 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009779 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009780 else
9781 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009782
Tim Peters05eba1f2004-08-27 21:32:02 +00009783 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009785 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009786 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009787
Benjamin Peterson29060642009-01-31 22:14:21 +00009788 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009789 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009790 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009791 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009792 return NULL;
9793}
9794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009795#define FILL(kind, data, value, start, length) \
9796 do { \
9797 Py_ssize_t i_ = 0; \
9798 assert(kind != PyUnicode_WCHAR_KIND); \
9799 switch ((kind)) { \
9800 case PyUnicode_1BYTE_KIND: { \
9801 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9802 memset(to_, (unsigned char)value, length); \
9803 break; \
9804 } \
9805 case PyUnicode_2BYTE_KIND: { \
9806 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9807 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9808 break; \
9809 } \
9810 default: { \
9811 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9812 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9813 break; \
9814 } \
9815 } \
9816 } while (0)
9817
Victor Stinner9310abb2011-10-05 00:59:23 +02009818static PyObject *
9819pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009820 Py_ssize_t left,
9821 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009823{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009824 PyObject *u;
9825 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009826 int kind;
9827 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009828
9829 if (left < 0)
9830 left = 0;
9831 if (right < 0)
9832 right = 0;
9833
Tim Peters7a29bd52001-09-12 03:03:31 +00009834 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009835 Py_INCREF(self);
9836 return self;
9837 }
9838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9840 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009841 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9842 return NULL;
9843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009844 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9845 if (fill > maxchar)
9846 maxchar = fill;
9847 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009848 if (!u)
9849 return NULL;
9850
9851 kind = PyUnicode_KIND(u);
9852 data = PyUnicode_DATA(u);
9853 if (left)
9854 FILL(kind, data, fill, 0, left);
9855 if (right)
9856 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009857 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009858 assert(_PyUnicode_CheckConsistency(u, 1));
9859 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009860}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009861#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009862
Alexander Belopolsky40018472011-02-26 01:02:56 +00009863PyObject *
9864PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009865{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009867
9868 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009870 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872 switch(PyUnicode_KIND(string)) {
9873 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009874 if (PyUnicode_IS_ASCII(string))
9875 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009876 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009877 PyUnicode_GET_LENGTH(string), keepends);
9878 else
9879 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009880 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009881 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009882 break;
9883 case PyUnicode_2BYTE_KIND:
9884 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009885 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009886 PyUnicode_GET_LENGTH(string), keepends);
9887 break;
9888 case PyUnicode_4BYTE_KIND:
9889 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009890 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891 PyUnicode_GET_LENGTH(string), keepends);
9892 break;
9893 default:
9894 assert(0);
9895 list = 0;
9896 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009897 Py_DECREF(string);
9898 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009899}
9900
Alexander Belopolsky40018472011-02-26 01:02:56 +00009901static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009902split(PyObject *self,
9903 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009904 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009905{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009906 int kind1, kind2, kind;
9907 void *buf1, *buf2;
9908 Py_ssize_t len1, len2;
9909 PyObject* out;
9910
Guido van Rossumd57fd912000-03-10 22:53:23 +00009911 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009912 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009914 if (PyUnicode_READY(self) == -1)
9915 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 if (substring == NULL)
9918 switch(PyUnicode_KIND(self)) {
9919 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009920 if (PyUnicode_IS_ASCII(self))
9921 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009922 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009923 PyUnicode_GET_LENGTH(self), maxcount
9924 );
9925 else
9926 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009927 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009928 PyUnicode_GET_LENGTH(self), maxcount
9929 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 case PyUnicode_2BYTE_KIND:
9931 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009932 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009933 PyUnicode_GET_LENGTH(self), maxcount
9934 );
9935 case PyUnicode_4BYTE_KIND:
9936 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009937 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938 PyUnicode_GET_LENGTH(self), maxcount
9939 );
9940 default:
9941 assert(0);
9942 return NULL;
9943 }
9944
9945 if (PyUnicode_READY(substring) == -1)
9946 return NULL;
9947
9948 kind1 = PyUnicode_KIND(self);
9949 kind2 = PyUnicode_KIND(substring);
9950 kind = kind1 > kind2 ? kind1 : kind2;
9951 buf1 = PyUnicode_DATA(self);
9952 buf2 = PyUnicode_DATA(substring);
9953 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009954 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009955 if (!buf1)
9956 return NULL;
9957 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009958 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 if (!buf2) {
9960 if (kind1 != kind) PyMem_Free(buf1);
9961 return NULL;
9962 }
9963 len1 = PyUnicode_GET_LENGTH(self);
9964 len2 = PyUnicode_GET_LENGTH(substring);
9965
9966 switch(kind) {
9967 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009968 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9969 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009970 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009971 else
9972 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009973 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009974 break;
9975 case PyUnicode_2BYTE_KIND:
9976 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009977 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009978 break;
9979 case PyUnicode_4BYTE_KIND:
9980 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009981 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009982 break;
9983 default:
9984 out = NULL;
9985 }
9986 if (kind1 != kind)
9987 PyMem_Free(buf1);
9988 if (kind2 != kind)
9989 PyMem_Free(buf2);
9990 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009991}
9992
Alexander Belopolsky40018472011-02-26 01:02:56 +00009993static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009994rsplit(PyObject *self,
9995 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009996 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009997{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 int kind1, kind2, kind;
9999 void *buf1, *buf2;
10000 Py_ssize_t len1, len2;
10001 PyObject* out;
10002
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010003 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010004 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010006 if (PyUnicode_READY(self) == -1)
10007 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 if (substring == NULL)
10010 switch(PyUnicode_KIND(self)) {
10011 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010012 if (PyUnicode_IS_ASCII(self))
10013 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010014 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010015 PyUnicode_GET_LENGTH(self), maxcount
10016 );
10017 else
10018 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010019 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010020 PyUnicode_GET_LENGTH(self), maxcount
10021 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 case PyUnicode_2BYTE_KIND:
10023 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010024 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 PyUnicode_GET_LENGTH(self), maxcount
10026 );
10027 case PyUnicode_4BYTE_KIND:
10028 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010029 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010030 PyUnicode_GET_LENGTH(self), maxcount
10031 );
10032 default:
10033 assert(0);
10034 return NULL;
10035 }
10036
10037 if (PyUnicode_READY(substring) == -1)
10038 return NULL;
10039
10040 kind1 = PyUnicode_KIND(self);
10041 kind2 = PyUnicode_KIND(substring);
10042 kind = kind1 > kind2 ? kind1 : kind2;
10043 buf1 = PyUnicode_DATA(self);
10044 buf2 = PyUnicode_DATA(substring);
10045 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010046 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047 if (!buf1)
10048 return NULL;
10049 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010050 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010051 if (!buf2) {
10052 if (kind1 != kind) PyMem_Free(buf1);
10053 return NULL;
10054 }
10055 len1 = PyUnicode_GET_LENGTH(self);
10056 len2 = PyUnicode_GET_LENGTH(substring);
10057
10058 switch(kind) {
10059 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010060 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10061 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010062 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010063 else
10064 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010065 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010066 break;
10067 case PyUnicode_2BYTE_KIND:
10068 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010069 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 break;
10071 case PyUnicode_4BYTE_KIND:
10072 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010073 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074 break;
10075 default:
10076 out = NULL;
10077 }
10078 if (kind1 != kind)
10079 PyMem_Free(buf1);
10080 if (kind2 != kind)
10081 PyMem_Free(buf2);
10082 return out;
10083}
10084
10085static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010086anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10087 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088{
10089 switch(kind) {
10090 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010091 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10092 return asciilib_find(buf1, len1, buf2, len2, offset);
10093 else
10094 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 case PyUnicode_2BYTE_KIND:
10096 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10097 case PyUnicode_4BYTE_KIND:
10098 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10099 }
10100 assert(0);
10101 return -1;
10102}
10103
10104static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010105anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10106 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107{
10108 switch(kind) {
10109 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010110 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10111 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10112 else
10113 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 case PyUnicode_2BYTE_KIND:
10115 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10116 case PyUnicode_4BYTE_KIND:
10117 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10118 }
10119 assert(0);
10120 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010121}
10122
Alexander Belopolsky40018472011-02-26 01:02:56 +000010123static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010124replace(PyObject *self, PyObject *str1,
10125 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010126{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010127 PyObject *u;
10128 char *sbuf = PyUnicode_DATA(self);
10129 char *buf1 = PyUnicode_DATA(str1);
10130 char *buf2 = PyUnicode_DATA(str2);
10131 int srelease = 0, release1 = 0, release2 = 0;
10132 int skind = PyUnicode_KIND(self);
10133 int kind1 = PyUnicode_KIND(str1);
10134 int kind2 = PyUnicode_KIND(str2);
10135 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10136 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10137 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010138 int mayshrink;
10139 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140
10141 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010142 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010144 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010145
Victor Stinner59de0ee2011-10-07 10:01:28 +020010146 if (str1 == str2)
10147 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 if (skind < kind1)
10149 /* substring too wide to be present */
10150 goto nothing;
10151
Victor Stinner49a0a212011-10-12 23:46:10 +020010152 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10153 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10154 /* Replacing str1 with str2 may cause a maxchar reduction in the
10155 result string. */
10156 mayshrink = (maxchar_str2 < maxchar);
10157 maxchar = Py_MAX(maxchar, maxchar_str2);
10158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010160 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010161 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010163 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010165 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010166 Py_UCS4 u1, u2;
10167 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010169 if (findchar(sbuf, PyUnicode_KIND(self),
10170 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010171 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010174 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010176 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 rkind = PyUnicode_KIND(u);
10178 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10179 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010180 if (--maxcount < 0)
10181 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010183 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010184 }
10185 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 int rkind = skind;
10187 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 if (kind1 < rkind) {
10190 /* widen substring */
10191 buf1 = _PyUnicode_AsKind(str1, rkind);
10192 if (!buf1) goto error;
10193 release1 = 1;
10194 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010195 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010196 if (i < 0)
10197 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 if (rkind > kind2) {
10199 /* widen replacement */
10200 buf2 = _PyUnicode_AsKind(str2, rkind);
10201 if (!buf2) goto error;
10202 release2 = 1;
10203 }
10204 else if (rkind < kind2) {
10205 /* widen self and buf1 */
10206 rkind = kind2;
10207 if (release1) PyMem_Free(buf1);
10208 sbuf = _PyUnicode_AsKind(self, rkind);
10209 if (!sbuf) goto error;
10210 srelease = 1;
10211 buf1 = _PyUnicode_AsKind(str1, rkind);
10212 if (!buf1) goto error;
10213 release1 = 1;
10214 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010215 u = PyUnicode_New(slen, maxchar);
10216 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010218 assert(PyUnicode_KIND(u) == rkind);
10219 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010220
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010221 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010222 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010223 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010225 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010227
10228 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010229 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010230 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010231 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010232 if (i == -1)
10233 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010234 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010236 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010239 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010240 }
10241 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 Py_ssize_t n, i, j, ires;
10243 Py_ssize_t product, new_size;
10244 int rkind = skind;
10245 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010248 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 buf1 = _PyUnicode_AsKind(str1, rkind);
10250 if (!buf1) goto error;
10251 release1 = 1;
10252 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010253 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010254 if (n == 0)
10255 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010257 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 buf2 = _PyUnicode_AsKind(str2, rkind);
10259 if (!buf2) goto error;
10260 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010261 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010262 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010263 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 rkind = kind2;
10265 sbuf = _PyUnicode_AsKind(self, rkind);
10266 if (!sbuf) goto error;
10267 srelease = 1;
10268 if (release1) PyMem_Free(buf1);
10269 buf1 = _PyUnicode_AsKind(str1, rkind);
10270 if (!buf1) goto error;
10271 release1 = 1;
10272 }
10273 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10274 PyUnicode_GET_LENGTH(str1))); */
10275 product = n * (len2-len1);
10276 if ((product / (len2-len1)) != n) {
10277 PyErr_SetString(PyExc_OverflowError,
10278 "replace string is too long");
10279 goto error;
10280 }
10281 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010282 if (new_size == 0) {
10283 Py_INCREF(unicode_empty);
10284 u = unicode_empty;
10285 goto done;
10286 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10288 PyErr_SetString(PyExc_OverflowError,
10289 "replace string is too long");
10290 goto error;
10291 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010292 u = PyUnicode_New(new_size, maxchar);
10293 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010295 assert(PyUnicode_KIND(u) == rkind);
10296 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 ires = i = 0;
10298 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010299 while (n-- > 0) {
10300 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010301 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010302 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010303 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010304 if (j == -1)
10305 break;
10306 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010307 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010308 memcpy(res + rkind * ires,
10309 sbuf + rkind * i,
10310 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010312 }
10313 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010315 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010317 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010319 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010321 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010323 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010324 memcpy(res + rkind * ires,
10325 sbuf + rkind * i,
10326 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010327 }
10328 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010329 /* interleave */
10330 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010331 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010333 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010335 if (--n <= 0)
10336 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010337 memcpy(res + rkind * ires,
10338 sbuf + rkind * i,
10339 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 ires++;
10341 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010342 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010343 memcpy(res + rkind * ires,
10344 sbuf + rkind * i,
10345 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010346 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010347 }
10348
10349 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010350 unicode_adjust_maxchar(&u);
10351 if (u == NULL)
10352 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010353 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010354
10355 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 if (srelease)
10357 PyMem_FREE(sbuf);
10358 if (release1)
10359 PyMem_FREE(buf1);
10360 if (release2)
10361 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010362 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010364
Benjamin Peterson29060642009-01-31 22:14:21 +000010365 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010366 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 if (srelease)
10368 PyMem_FREE(sbuf);
10369 if (release1)
10370 PyMem_FREE(buf1);
10371 if (release2)
10372 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010373 if (PyUnicode_CheckExact(self)) {
10374 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010375 return self;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010376 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010377 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 error:
10379 if (srelease && sbuf)
10380 PyMem_FREE(sbuf);
10381 if (release1 && buf1)
10382 PyMem_FREE(buf1);
10383 if (release2 && buf2)
10384 PyMem_FREE(buf2);
10385 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386}
10387
10388/* --- Unicode Object Methods --------------------------------------------- */
10389
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010390PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010391 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392\n\
10393Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010394characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010395
10396static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010397unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010398{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399 return fixup(self, fixtitle);
10400}
10401
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010402PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010403 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010404\n\
10405Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010406have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010407
10408static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010409unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010410{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010411 return fixup(self, fixcapitalize);
10412}
10413
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace, run every
   word through fixup(..., fixcapitalize), then join the words again.
   Returns NULL if splitting or any capitalization fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL) {
        return NULL;
    }

    /* Capitalize each word, swapping the new string into the list slot */
    idx = 0;
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *capitalized = fixup(PyList_GET_ITEM(words, idx),
                                      fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
        idx++;
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10451
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010452/* Argument converter. Coerces to a single unicode character */
10453
10454static int
10455convert_uc(PyObject *obj, void *addr)
10456{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010458 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010459
Benjamin Peterson14339b62009-01-31 16:36:08 +000010460 uniobj = PyUnicode_FromObject(obj);
10461 if (uniobj == NULL) {
10462 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010463 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010464 return 0;
10465 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010467 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010468 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010469 Py_DECREF(uniobj);
10470 return 0;
10471 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010473 Py_DECREF(uniobj);
10474 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010475}
10476
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010477PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010478 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010479\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010480Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010481done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010482
10483static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010484unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010485{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010486 Py_ssize_t marg, left;
10487 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488 Py_UCS4 fillchar = ' ';
10489
Victor Stinnere9a29352011-10-01 02:14:59 +020010490 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492
Victor Stinnere9a29352011-10-01 02:14:59 +020010493 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010494 return NULL;
10495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010498 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010499 }
10500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010502 left = marg / 2 + (marg & width & 1);
10503
Victor Stinner9310abb2011-10-05 00:59:23 +020010504 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505}
10506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507/* This function assumes that str1 and str2 are readied by the caller. */
10508
Marc-André Lemburge5034372000-08-08 08:04:29 +000010509static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010510unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010511{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 int kind1, kind2;
10513 void *data1, *data2;
10514 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 kind1 = PyUnicode_KIND(str1);
10517 kind2 = PyUnicode_KIND(str2);
10518 data1 = PyUnicode_DATA(str1);
10519 data2 = PyUnicode_DATA(str2);
10520 len1 = PyUnicode_GET_LENGTH(str1);
10521 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 for (i = 0; i < len1 && i < len2; ++i) {
10524 Py_UCS4 c1, c2;
10525 c1 = PyUnicode_READ(kind1, data1, i);
10526 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010527
10528 if (c1 != c2)
10529 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010530 }
10531
10532 return (len1 < len2) ? -1 : (len1 != len2);
10533}
10534
Alexander Belopolsky40018472011-02-26 01:02:56 +000010535int
10536PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010537{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010538 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10539 if (PyUnicode_READY(left) == -1 ||
10540 PyUnicode_READY(right) == -1)
10541 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010542 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010544 PyErr_Format(PyExc_TypeError,
10545 "Can't compare %.100s and %.100s",
10546 left->ob_type->tp_name,
10547 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010548 return -1;
10549}
10550
Martin v. Löwis5b222132007-06-10 09:51:05 +000010551int
10552PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10553{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 Py_ssize_t i;
10555 int kind;
10556 void *data;
10557 Py_UCS4 chr;
10558
Victor Stinner910337b2011-10-03 03:20:16 +020010559 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 if (PyUnicode_READY(uni) == -1)
10561 return -1;
10562 kind = PyUnicode_KIND(uni);
10563 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010564 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10566 if (chr != str[i])
10567 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010568 /* This check keeps Python strings that end in '\0' from comparing equal
10569 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010571 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010572 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010573 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010574 return 0;
10575}
10576
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010577
Benjamin Peterson29060642009-01-31 22:14:21 +000010578#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010579 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010580
Alexander Belopolsky40018472011-02-26 01:02:56 +000010581PyObject *
10582PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010583{
10584 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010585
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010586 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10587 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 if (PyUnicode_READY(left) == -1 ||
10589 PyUnicode_READY(right) == -1)
10590 return NULL;
10591 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10592 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010593 if (op == Py_EQ) {
10594 Py_INCREF(Py_False);
10595 return Py_False;
10596 }
10597 if (op == Py_NE) {
10598 Py_INCREF(Py_True);
10599 return Py_True;
10600 }
10601 }
10602 if (left == right)
10603 result = 0;
10604 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010605 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010606
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010607 /* Convert the return value to a Boolean */
10608 switch (op) {
10609 case Py_EQ:
10610 v = TEST_COND(result == 0);
10611 break;
10612 case Py_NE:
10613 v = TEST_COND(result != 0);
10614 break;
10615 case Py_LE:
10616 v = TEST_COND(result <= 0);
10617 break;
10618 case Py_GE:
10619 v = TEST_COND(result >= 0);
10620 break;
10621 case Py_LT:
10622 v = TEST_COND(result == -1);
10623 break;
10624 case Py_GT:
10625 v = TEST_COND(result == 1);
10626 break;
10627 default:
10628 PyErr_BadArgument();
10629 return NULL;
10630 }
10631 Py_INCREF(v);
10632 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010633 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010634
Brian Curtindfc80e32011-08-10 20:28:54 -050010635 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010636}
10637
Alexander Belopolsky40018472011-02-26 01:02:56 +000010638int
10639PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010640{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010641 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 int kind1, kind2, kind;
10643 void *buf1, *buf2;
10644 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010645 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010646
10647 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010648 sub = PyUnicode_FromObject(element);
10649 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010650 PyErr_Format(PyExc_TypeError,
10651 "'in <string>' requires string as left operand, not %s",
10652 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010653 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010654 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 if (PyUnicode_READY(sub) == -1)
10656 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010657
Thomas Wouters477c8d52006-05-27 19:21:47 +000010658 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010659 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010660 Py_DECREF(sub);
10661 return -1;
10662 }
10663
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 kind1 = PyUnicode_KIND(str);
10665 kind2 = PyUnicode_KIND(sub);
10666 kind = kind1 > kind2 ? kind1 : kind2;
10667 buf1 = PyUnicode_DATA(str);
10668 buf2 = PyUnicode_DATA(sub);
10669 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010670 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 if (!buf1) {
10672 Py_DECREF(sub);
10673 return -1;
10674 }
10675 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010676 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 if (!buf2) {
10678 Py_DECREF(sub);
10679 if (kind1 != kind) PyMem_Free(buf1);
10680 return -1;
10681 }
10682 len1 = PyUnicode_GET_LENGTH(str);
10683 len2 = PyUnicode_GET_LENGTH(sub);
10684
10685 switch(kind) {
10686 case PyUnicode_1BYTE_KIND:
10687 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10688 break;
10689 case PyUnicode_2BYTE_KIND:
10690 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10691 break;
10692 case PyUnicode_4BYTE_KIND:
10693 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10694 break;
10695 default:
10696 result = -1;
10697 assert(0);
10698 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010699
10700 Py_DECREF(str);
10701 Py_DECREF(sub);
10702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 if (kind1 != kind)
10704 PyMem_Free(buf1);
10705 if (kind2 != kind)
10706 PyMem_Free(buf2);
10707
Guido van Rossum403d68b2000-03-13 15:55:09 +000010708 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010709}
10710
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711/* Concat to string or Unicode object giving a new Unicode object. */
10712
Alexander Belopolsky40018472011-02-26 01:02:56 +000010713PyObject *
10714PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010717 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010718
10719 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010721 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010722 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010724 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010725 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726
10727 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010728 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010729 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010732 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010733 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010735 }
10736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010738 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10739 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010742 w = PyUnicode_New(
10743 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10744 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010746 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010747 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10748 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749 Py_DECREF(u);
10750 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010751 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010753
Benjamin Peterson29060642009-01-31 22:14:21 +000010754 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755 Py_XDECREF(u);
10756 Py_XDECREF(v);
10757 return NULL;
10758}
10759
Victor Stinnerb0923652011-10-04 01:17:31 +020010760static void
10761unicode_append_inplace(PyObject **p_left, PyObject *right)
10762{
10763 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010764
10765 assert(PyUnicode_IS_READY(*p_left));
10766 assert(PyUnicode_IS_READY(right));
10767
10768 left_len = PyUnicode_GET_LENGTH(*p_left);
10769 right_len = PyUnicode_GET_LENGTH(right);
10770 if (left_len > PY_SSIZE_T_MAX - right_len) {
10771 PyErr_SetString(PyExc_OverflowError,
10772 "strings are too large to concat");
10773 goto error;
10774 }
10775 new_len = left_len + right_len;
10776
10777 /* Now we own the last reference to 'left', so we can resize it
10778 * in-place.
10779 */
10780 if (unicode_resize(p_left, new_len) != 0) {
10781 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10782 * deallocated so it cannot be put back into
10783 * 'variable'. The MemoryError is raised when there
10784 * is no value in 'variable', which might (very
10785 * remotely) be a cause of incompatibilities.
10786 */
10787 goto error;
10788 }
10789 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010790 copy_characters(*p_left, left_len, right, 0, right_len);
10791 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010792 return;
10793
10794error:
10795 Py_DECREF(*p_left);
10796 *p_left = NULL;
10797}
10798
Walter Dörwald1ab83302007-05-18 17:15:44 +000010799void
Victor Stinner23e56682011-10-03 03:54:37 +020010800PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010801{
Victor Stinner23e56682011-10-03 03:54:37 +020010802 PyObject *left, *res;
10803
10804 if (p_left == NULL) {
10805 if (!PyErr_Occurred())
10806 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010807 return;
10808 }
Victor Stinner23e56682011-10-03 03:54:37 +020010809 left = *p_left;
10810 if (right == NULL || !PyUnicode_Check(left)) {
10811 if (!PyErr_Occurred())
10812 PyErr_BadInternalCall();
10813 goto error;
10814 }
10815
Victor Stinnere1335c72011-10-04 20:53:03 +020010816 if (PyUnicode_READY(left))
10817 goto error;
10818 if (PyUnicode_READY(right))
10819 goto error;
10820
Victor Stinner23e56682011-10-03 03:54:37 +020010821 if (PyUnicode_CheckExact(left) && left != unicode_empty
10822 && PyUnicode_CheckExact(right) && right != unicode_empty
10823 && unicode_resizable(left)
10824 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10825 || _PyUnicode_WSTR(left) != NULL))
10826 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010827 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10828 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010829 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010830 not so different than duplicating the string. */
10831 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010832 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010833 unicode_append_inplace(p_left, right);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010834 assert(p_left == NULL || _PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010835 return;
10836 }
10837 }
10838
10839 res = PyUnicode_Concat(left, right);
10840 if (res == NULL)
10841 goto error;
10842 Py_DECREF(left);
10843 *p_left = res;
10844 return;
10845
10846error:
10847 Py_DECREF(*p_left);
10848 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010849}
10850
10851void
10852PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10853{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010854 PyUnicode_Append(pleft, right);
10855 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010856}
10857
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010858PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010859 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010860\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010861Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010862string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010863interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864
10865static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010866unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010867{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010868 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010869 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010870 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010872 int kind1, kind2, kind;
10873 void *buf1, *buf2;
10874 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010875
Jesus Ceaac451502011-04-20 17:09:23 +020010876 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10877 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010878 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010880 kind1 = PyUnicode_KIND(self);
10881 kind2 = PyUnicode_KIND(substring);
10882 kind = kind1 > kind2 ? kind1 : kind2;
10883 buf1 = PyUnicode_DATA(self);
10884 buf2 = PyUnicode_DATA(substring);
10885 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010886 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010887 if (!buf1) {
10888 Py_DECREF(substring);
10889 return NULL;
10890 }
10891 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010892 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010893 if (!buf2) {
10894 Py_DECREF(substring);
10895 if (kind1 != kind) PyMem_Free(buf1);
10896 return NULL;
10897 }
10898 len1 = PyUnicode_GET_LENGTH(self);
10899 len2 = PyUnicode_GET_LENGTH(substring);
10900
10901 ADJUST_INDICES(start, end, len1);
10902 switch(kind) {
10903 case PyUnicode_1BYTE_KIND:
10904 iresult = ucs1lib_count(
10905 ((Py_UCS1*)buf1) + start, end - start,
10906 buf2, len2, PY_SSIZE_T_MAX
10907 );
10908 break;
10909 case PyUnicode_2BYTE_KIND:
10910 iresult = ucs2lib_count(
10911 ((Py_UCS2*)buf1) + start, end - start,
10912 buf2, len2, PY_SSIZE_T_MAX
10913 );
10914 break;
10915 case PyUnicode_4BYTE_KIND:
10916 iresult = ucs4lib_count(
10917 ((Py_UCS4*)buf1) + start, end - start,
10918 buf2, len2, PY_SSIZE_T_MAX
10919 );
10920 break;
10921 default:
10922 assert(0); iresult = 0;
10923 }
10924
10925 result = PyLong_FromSsize_t(iresult);
10926
10927 if (kind1 != kind)
10928 PyMem_Free(buf1);
10929 if (kind2 != kind)
10930 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931
10932 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010933
Guido van Rossumd57fd912000-03-10 22:53:23 +000010934 return result;
10935}
10936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010937PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010938 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010940Encode S using the codec registered for encoding. Default encoding\n\
10941is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010942handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010943a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10944'xmlcharrefreplace' as well as any other name registered with\n\
10945codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946
10947static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010948unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010950 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951 char *encoding = NULL;
10952 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010953
Benjamin Peterson308d6372009-09-18 21:42:35 +000010954 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10955 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010957 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010958}
10959
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010960PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010961 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962\n\
10963Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010964If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965
10966static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010967unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010969 Py_ssize_t i, j, line_pos, src_len, incr;
10970 Py_UCS4 ch;
10971 PyObject *u;
10972 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010974 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010975 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976
10977 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010978 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979
Antoine Pitrou22425222011-10-04 19:10:51 +020010980 if (PyUnicode_READY(self) == -1)
10981 return NULL;
10982
Thomas Wouters7e474022000-07-16 12:04:32 +000010983 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010984 src_len = PyUnicode_GET_LENGTH(self);
10985 i = j = line_pos = 0;
10986 kind = PyUnicode_KIND(self);
10987 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010988 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010989 for (; i < src_len; i++) {
10990 ch = PyUnicode_READ(kind, src_data, i);
10991 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010992 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010993 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010994 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010995 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010996 goto overflow;
10997 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010998 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010999 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011000 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011002 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011003 goto overflow;
11004 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011006 if (ch == '\n' || ch == '\r')
11007 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011009 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020011010 if (!found && PyUnicode_CheckExact(self)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +010011011 Py_INCREF(self);
11012 return self;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011013 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011014
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011016 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017 if (!u)
11018 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011019 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020
Antoine Pitroue71d5742011-10-04 15:55:09 +020011021 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011022
Antoine Pitroue71d5742011-10-04 15:55:09 +020011023 for (; i < src_len; i++) {
11024 ch = PyUnicode_READ(kind, src_data, i);
11025 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011026 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011027 incr = tabsize - (line_pos % tabsize);
11028 line_pos += incr;
11029 while (incr--) {
11030 PyUnicode_WRITE(kind, dest_data, j, ' ');
11031 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011032 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011033 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011034 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011035 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011036 line_pos++;
11037 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011038 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011039 if (ch == '\n' || ch == '\r')
11040 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011042 }
11043 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011044 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011045
Antoine Pitroue71d5742011-10-04 15:55:09 +020011046 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011047 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11048 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049}
11050
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011051PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011052 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053\n\
11054Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011055such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056arguments start and end are interpreted as in slice notation.\n\
11057\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011058Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059
11060static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011061unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011063 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011064 Py_ssize_t start;
11065 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011066 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067
Jesus Ceaac451502011-04-20 17:09:23 +020011068 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11069 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011072 if (PyUnicode_READY(self) == -1)
11073 return NULL;
11074 if (PyUnicode_READY(substring) == -1)
11075 return NULL;
11076
Victor Stinner7931d9a2011-11-04 00:22:48 +010011077 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078
11079 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011081 if (result == -2)
11082 return NULL;
11083
Christian Heimes217cfd12007-12-02 14:31:20 +000011084 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011085}
11086
11087static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011088unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011089{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011090 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11091 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094}
11095
Guido van Rossumc2504932007-09-18 19:42:40 +000011096/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011097 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011098static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011099unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011100{
Guido van Rossumc2504932007-09-18 19:42:40 +000011101 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011102 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011104 if (_PyUnicode_HASH(self) != -1)
11105 return _PyUnicode_HASH(self);
11106 if (PyUnicode_READY(self) == -1)
11107 return -1;
11108 len = PyUnicode_GET_LENGTH(self);
11109
11110 /* The hash function as a macro, gets expanded three times below. */
11111#define HASH(P) \
11112 x = (Py_uhash_t)*P << 7; \
11113 while (--len >= 0) \
11114 x = (1000003*x) ^ (Py_uhash_t)*P++;
11115
11116 switch (PyUnicode_KIND(self)) {
11117 case PyUnicode_1BYTE_KIND: {
11118 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11119 HASH(c);
11120 break;
11121 }
11122 case PyUnicode_2BYTE_KIND: {
11123 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11124 HASH(s);
11125 break;
11126 }
11127 default: {
11128 Py_UCS4 *l;
11129 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11130 "Impossible switch case in unicode_hash");
11131 l = PyUnicode_4BYTE_DATA(self);
11132 HASH(l);
11133 break;
11134 }
11135 }
11136 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11137
Guido van Rossumc2504932007-09-18 19:42:40 +000011138 if (x == -1)
11139 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011140 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011141 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011143#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011144
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011145PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011146 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011147\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011148Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149
11150static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011151unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011152{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011153 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011154 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011155 Py_ssize_t start;
11156 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011157
Jesus Ceaac451502011-04-20 17:09:23 +020011158 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11159 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011160 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011161
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162 if (PyUnicode_READY(self) == -1)
11163 return NULL;
11164 if (PyUnicode_READY(substring) == -1)
11165 return NULL;
11166
Victor Stinner7931d9a2011-11-04 00:22:48 +010011167 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011168
11169 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011171 if (result == -2)
11172 return NULL;
11173
Guido van Rossumd57fd912000-03-10 22:53:23 +000011174 if (result < 0) {
11175 PyErr_SetString(PyExc_ValueError, "substring not found");
11176 return NULL;
11177 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011178
Christian Heimes217cfd12007-12-02 14:31:20 +000011179 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180}
11181
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011182PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011183 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011185Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011186at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011187
11188static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011189unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011191 Py_ssize_t i, length;
11192 int kind;
11193 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194 int cased;
11195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 if (PyUnicode_READY(self) == -1)
11197 return NULL;
11198 length = PyUnicode_GET_LENGTH(self);
11199 kind = PyUnicode_KIND(self);
11200 data = PyUnicode_DATA(self);
11201
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011203 if (length == 1)
11204 return PyBool_FromLong(
11205 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011207 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011208 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011209 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011210
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011212 for (i = 0; i < length; i++) {
11213 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011214
Benjamin Peterson29060642009-01-31 22:14:21 +000011215 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11216 return PyBool_FromLong(0);
11217 else if (!cased && Py_UNICODE_ISLOWER(ch))
11218 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011220 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221}
11222
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011223PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011224 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011226Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011227at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228
11229static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011230unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011232 Py_ssize_t i, length;
11233 int kind;
11234 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235 int cased;
11236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237 if (PyUnicode_READY(self) == -1)
11238 return NULL;
11239 length = PyUnicode_GET_LENGTH(self);
11240 kind = PyUnicode_KIND(self);
11241 data = PyUnicode_DATA(self);
11242
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011244 if (length == 1)
11245 return PyBool_FromLong(
11246 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011248 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011249 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011250 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011251
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011253 for (i = 0; i < length; i++) {
11254 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011255
Benjamin Peterson29060642009-01-31 22:14:21 +000011256 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11257 return PyBool_FromLong(0);
11258 else if (!cased && Py_UNICODE_ISUPPER(ch))
11259 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011261 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262}
11263
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011264PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011265 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011267Return True if S is a titlecased string and there is at least one\n\
11268character in S, i.e. upper- and titlecase characters may only\n\
11269follow uncased characters and lowercase characters only cased ones.\n\
11270Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271
11272static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011273unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011275 Py_ssize_t i, length;
11276 int kind;
11277 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278 int cased, previous_is_cased;
11279
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011280 if (PyUnicode_READY(self) == -1)
11281 return NULL;
11282 length = PyUnicode_GET_LENGTH(self);
11283 kind = PyUnicode_KIND(self);
11284 data = PyUnicode_DATA(self);
11285
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011287 if (length == 1) {
11288 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11289 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11290 (Py_UNICODE_ISUPPER(ch) != 0));
11291 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011293 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011294 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011295 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011296
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297 cased = 0;
11298 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011299 for (i = 0; i < length; i++) {
11300 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011301
Benjamin Peterson29060642009-01-31 22:14:21 +000011302 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11303 if (previous_is_cased)
11304 return PyBool_FromLong(0);
11305 previous_is_cased = 1;
11306 cased = 1;
11307 }
11308 else if (Py_UNICODE_ISLOWER(ch)) {
11309 if (!previous_is_cased)
11310 return PyBool_FromLong(0);
11311 previous_is_cased = 1;
11312 cased = 1;
11313 }
11314 else
11315 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011317 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318}
11319
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011320PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011321 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011323Return True if all characters in S are whitespace\n\
11324and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325
11326static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011327unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011329 Py_ssize_t i, length;
11330 int kind;
11331 void *data;
11332
11333 if (PyUnicode_READY(self) == -1)
11334 return NULL;
11335 length = PyUnicode_GET_LENGTH(self);
11336 kind = PyUnicode_KIND(self);
11337 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338
Guido van Rossumd57fd912000-03-10 22:53:23 +000011339 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011340 if (length == 1)
11341 return PyBool_FromLong(
11342 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011343
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011344 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011345 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011346 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011347
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011348 for (i = 0; i < length; i++) {
11349 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011350 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011351 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011353 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354}
11355
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011356PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011357 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011358\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011359Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011360and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011361
11362static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011363unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011364{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011365 Py_ssize_t i, length;
11366 int kind;
11367 void *data;
11368
11369 if (PyUnicode_READY(self) == -1)
11370 return NULL;
11371 length = PyUnicode_GET_LENGTH(self);
11372 kind = PyUnicode_KIND(self);
11373 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011374
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011375 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011376 if (length == 1)
11377 return PyBool_FromLong(
11378 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011379
11380 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011381 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011382 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384 for (i = 0; i < length; i++) {
11385 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011386 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011387 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011388 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011389}
11390
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011391PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011392 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011393\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011394Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011395and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011396
11397static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011398unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011399{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 int kind;
11401 void *data;
11402 Py_ssize_t len, i;
11403
11404 if (PyUnicode_READY(self) == -1)
11405 return NULL;
11406
11407 kind = PyUnicode_KIND(self);
11408 data = PyUnicode_DATA(self);
11409 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011410
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011411 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011412 if (len == 1) {
11413 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11414 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11415 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011416
11417 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011418 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011419 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011421 for (i = 0; i < len; i++) {
11422 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011423 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011424 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011425 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011426 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011427}
11428
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011429PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011430 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011432Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011433False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434
11435static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011436unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011438 Py_ssize_t i, length;
11439 int kind;
11440 void *data;
11441
11442 if (PyUnicode_READY(self) == -1)
11443 return NULL;
11444 length = PyUnicode_GET_LENGTH(self);
11445 kind = PyUnicode_KIND(self);
11446 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449 if (length == 1)
11450 return PyBool_FromLong(
11451 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011453 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011454 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011455 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011457 for (i = 0; i < length; i++) {
11458 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011459 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011461 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462}
11463
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011464PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011465 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011467Return True if all characters in S are digits\n\
11468and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469
11470static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011471unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011473 Py_ssize_t i, length;
11474 int kind;
11475 void *data;
11476
11477 if (PyUnicode_READY(self) == -1)
11478 return NULL;
11479 length = PyUnicode_GET_LENGTH(self);
11480 kind = PyUnicode_KIND(self);
11481 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011484 if (length == 1) {
11485 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11486 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11487 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011489 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011490 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011491 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011492
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011493 for (i = 0; i < length; i++) {
11494 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011495 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011497 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498}
11499
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011500PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011501 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011503Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011504False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505
11506static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011507unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011509 Py_ssize_t i, length;
11510 int kind;
11511 void *data;
11512
11513 if (PyUnicode_READY(self) == -1)
11514 return NULL;
11515 length = PyUnicode_GET_LENGTH(self);
11516 kind = PyUnicode_KIND(self);
11517 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011520 if (length == 1)
11521 return PyBool_FromLong(
11522 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011524 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011525 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011526 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528 for (i = 0; i < length; i++) {
11529 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011530 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011532 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533}
11534
Martin v. Löwis47383402007-08-15 07:32:56 +000011535int
11536PyUnicode_IsIdentifier(PyObject *self)
11537{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538 int kind;
11539 void *data;
11540 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011541 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011543 if (PyUnicode_READY(self) == -1) {
11544 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011545 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011546 }
11547
11548 /* Special case for empty strings */
11549 if (PyUnicode_GET_LENGTH(self) == 0)
11550 return 0;
11551 kind = PyUnicode_KIND(self);
11552 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011553
11554 /* PEP 3131 says that the first character must be in
11555 XID_Start and subsequent characters in XID_Continue,
11556 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011557 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011558 letters, digits, underscore). However, given the current
11559 definition of XID_Start and XID_Continue, it is sufficient
11560 to check just for these, except that _ must be allowed
11561 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011562 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011563 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011564 return 0;
11565
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011566 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011567 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011568 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011569 return 1;
11570}
11571
11572PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011573 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011574\n\
11575Return True if S is a valid identifier according\n\
11576to the language definition.");
11577
11578static PyObject*
11579unicode_isidentifier(PyObject *self)
11580{
11581 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11582}
11583
Georg Brandl559e5d72008-06-11 18:37:52 +000011584PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011585 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011586\n\
11587Return True if all characters in S are considered\n\
11588printable in repr() or S is empty, False otherwise.");
11589
11590static PyObject*
11591unicode_isprintable(PyObject *self)
11592{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011593 Py_ssize_t i, length;
11594 int kind;
11595 void *data;
11596
11597 if (PyUnicode_READY(self) == -1)
11598 return NULL;
11599 length = PyUnicode_GET_LENGTH(self);
11600 kind = PyUnicode_KIND(self);
11601 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011602
11603 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011604 if (length == 1)
11605 return PyBool_FromLong(
11606 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011607
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011608 for (i = 0; i < length; i++) {
11609 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011610 Py_RETURN_FALSE;
11611 }
11612 }
11613 Py_RETURN_TRUE;
11614}
11615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011616PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011617 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618\n\
11619Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011620iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621
11622static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011623unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011625 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626}
11627
Martin v. Löwis18e16552006-02-15 17:27:45 +000011628static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011629unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 if (PyUnicode_READY(self) == -1)
11632 return -1;
11633 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634}
11635
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011636PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011637 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011639Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011640done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011641
11642static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011643unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011645 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011646 Py_UCS4 fillchar = ' ';
11647
11648 if (PyUnicode_READY(self) == -1)
11649 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011650
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011651 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652 return NULL;
11653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011654 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011655 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011656 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657 }
11658
Victor Stinner7931d9a2011-11-04 00:22:48 +010011659 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660}
11661
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011662PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011663 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011665Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666
11667static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011668unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670 return fixup(self, fixlower);
11671}
11672
/* Selectors for which end(s) of the string a strip operation trims.
   Their values double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the selectors above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: skips the 3-character "|O:" prefix of the
   corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
11681
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011682/* externally visible for str.strip(unicode) */
11683PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011684_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011685{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011686 void *data;
11687 int kind;
11688 Py_ssize_t i, j, len;
11689 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011690
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011691 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11692 return NULL;
11693
11694 kind = PyUnicode_KIND(self);
11695 data = PyUnicode_DATA(self);
11696 len = PyUnicode_GET_LENGTH(self);
11697 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11698 PyUnicode_DATA(sepobj),
11699 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011700
Benjamin Peterson14339b62009-01-31 16:36:08 +000011701 i = 0;
11702 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011703 while (i < len &&
11704 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011705 i++;
11706 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011707 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011708
Benjamin Peterson14339b62009-01-31 16:36:08 +000011709 j = len;
11710 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011711 do {
11712 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011713 } while (j >= i &&
11714 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011715 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011716 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011717
Victor Stinner7931d9a2011-11-04 00:22:48 +010011718 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719}
11720
11721PyObject*
11722PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11723{
11724 unsigned char *data;
11725 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011726 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011727
Victor Stinnerde636f32011-10-01 03:55:54 +020011728 if (PyUnicode_READY(self) == -1)
11729 return NULL;
11730
11731 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11732
Victor Stinner12bab6d2011-10-01 01:53:49 +020011733 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011735 if (PyUnicode_CheckExact(self)) {
11736 Py_INCREF(self);
11737 return self;
11738 }
11739 else
11740 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011741 }
11742
Victor Stinner12bab6d2011-10-01 01:53:49 +020011743 length = end - start;
11744 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011745 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746
Victor Stinnerde636f32011-10-01 03:55:54 +020011747 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011748 PyErr_SetString(PyExc_IndexError, "string index out of range");
11749 return NULL;
11750 }
11751
Victor Stinnerb9275c12011-10-05 14:01:42 +020011752 if (PyUnicode_IS_ASCII(self)) {
11753 kind = PyUnicode_KIND(self);
11754 data = PyUnicode_1BYTE_DATA(self);
11755 return unicode_fromascii(data + start, length);
11756 }
11757 else {
11758 kind = PyUnicode_KIND(self);
11759 data = PyUnicode_1BYTE_DATA(self);
11760 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011761 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011762 length);
11763 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765
11766static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011767do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 int kind;
11770 void *data;
11771 Py_ssize_t len, i, j;
11772
11773 if (PyUnicode_READY(self) == -1)
11774 return NULL;
11775
11776 kind = PyUnicode_KIND(self);
11777 data = PyUnicode_DATA(self);
11778 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011779
Benjamin Peterson14339b62009-01-31 16:36:08 +000011780 i = 0;
11781 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011783 i++;
11784 }
11785 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011786
Benjamin Peterson14339b62009-01-31 16:36:08 +000011787 j = len;
11788 if (striptype != LEFTSTRIP) {
11789 do {
11790 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011792 j++;
11793 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011794
Victor Stinner7931d9a2011-11-04 00:22:48 +010011795 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796}
11797
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011798
11799static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011800do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011801{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011802 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011803
Benjamin Peterson14339b62009-01-31 16:36:08 +000011804 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11805 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011806
Benjamin Peterson14339b62009-01-31 16:36:08 +000011807 if (sep != NULL && sep != Py_None) {
11808 if (PyUnicode_Check(sep))
11809 return _PyUnicode_XStrip(self, striptype, sep);
11810 else {
11811 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011812 "%s arg must be None or str",
11813 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011814 return NULL;
11815 }
11816 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011817
Benjamin Peterson14339b62009-01-31 16:36:08 +000011818 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011819}
11820
11821
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011822PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011823 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011824\n\
11825Return a copy of the string S with leading and trailing\n\
11826whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011827If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011828
11829static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011830unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011831{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011832 if (PyTuple_GET_SIZE(args) == 0)
11833 return do_strip(self, BOTHSTRIP); /* Common case */
11834 else
11835 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011836}
11837
11838
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011839PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011840 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011841\n\
11842Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011843If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011844
11845static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011846unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011847{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011848 if (PyTuple_GET_SIZE(args) == 0)
11849 return do_strip(self, LEFTSTRIP); /* Common case */
11850 else
11851 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011852}
11853
11854
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011855PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011856 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011857\n\
11858Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011859If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011860
11861static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011862unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011863{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011864 if (PyTuple_GET_SIZE(args) == 0)
11865 return do_strip(self, RIGHTSTRIP); /* Common case */
11866 else
11867 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011868}
11869
11870
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011872unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011874 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011875 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876
Georg Brandl222de0f2009-04-12 12:01:50 +000011877 if (len < 1) {
11878 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011879 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011880 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881
Tim Peters7a29bd52001-09-12 03:03:31 +000011882 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883 /* no repeat, return original string */
11884 Py_INCREF(str);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011885 return str;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886 }
Tim Peters8f422462000-09-09 06:13:41 +000011887
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011888 if (PyUnicode_READY(str) == -1)
11889 return NULL;
11890
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011891 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011892 PyErr_SetString(PyExc_OverflowError,
11893 "repeated string is too long");
11894 return NULL;
11895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011897
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011898 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899 if (!u)
11900 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011901 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011903 if (PyUnicode_GET_LENGTH(str) == 1) {
11904 const int kind = PyUnicode_KIND(str);
11905 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11906 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011907 if (kind == PyUnicode_1BYTE_KIND)
11908 memset(to, (unsigned char)fill_char, len);
11909 else {
11910 for (n = 0; n < len; ++n)
11911 PyUnicode_WRITE(kind, to, n, fill_char);
11912 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011913 }
11914 else {
11915 /* number of characters copied this far */
11916 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011917 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918 char *to = (char *) PyUnicode_DATA(u);
11919 Py_MEMCPY(to, PyUnicode_DATA(str),
11920 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011921 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 n = (done <= nchars-done) ? done : nchars-done;
11923 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011924 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011925 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926 }
11927
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011928 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011929 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930}
11931
Alexander Belopolsky40018472011-02-26 01:02:56 +000011932PyObject *
11933PyUnicode_Replace(PyObject *obj,
11934 PyObject *subobj,
11935 PyObject *replobj,
11936 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937{
11938 PyObject *self;
11939 PyObject *str1;
11940 PyObject *str2;
11941 PyObject *result;
11942
11943 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011944 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011945 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011947 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011948 Py_DECREF(self);
11949 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950 }
11951 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011952 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011953 Py_DECREF(self);
11954 Py_DECREF(str1);
11955 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011956 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958 Py_DECREF(self);
11959 Py_DECREF(str1);
11960 Py_DECREF(str2);
11961 return result;
11962}
11963
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011964PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011965 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966\n\
11967Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011968old replaced by new. If the optional argument count is\n\
11969given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970
11971static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 PyObject *str1;
11975 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011976 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977 PyObject *result;
11978
Martin v. Löwis18e16552006-02-15 17:27:45 +000011979 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011980 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011982 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 str1 = PyUnicode_FromObject(str1);
11984 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11985 return NULL;
11986 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011987 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011988 Py_DECREF(str1);
11989 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011990 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991
11992 result = replace(self, str1, str2, maxcount);
11993
11994 Py_DECREF(str1);
11995 Py_DECREF(str2);
11996 return result;
11997}
11998
Alexander Belopolsky40018472011-02-26 01:02:56 +000011999static PyObject *
12000unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012002 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 Py_ssize_t isize;
12004 Py_ssize_t osize, squote, dquote, i, o;
12005 Py_UCS4 max, quote;
12006 int ikind, okind;
12007 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012010 return NULL;
12011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 isize = PyUnicode_GET_LENGTH(unicode);
12013 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015 /* Compute length of output, quote characters, and
12016 maximum character */
12017 osize = 2; /* quotes */
12018 max = 127;
12019 squote = dquote = 0;
12020 ikind = PyUnicode_KIND(unicode);
12021 for (i = 0; i < isize; i++) {
12022 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12023 switch (ch) {
12024 case '\'': squote++; osize++; break;
12025 case '"': dquote++; osize++; break;
12026 case '\\': case '\t': case '\r': case '\n':
12027 osize += 2; break;
12028 default:
12029 /* Fast-path ASCII */
12030 if (ch < ' ' || ch == 0x7f)
12031 osize += 4; /* \xHH */
12032 else if (ch < 0x7f)
12033 osize++;
12034 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12035 osize++;
12036 max = ch > max ? ch : max;
12037 }
12038 else if (ch < 0x100)
12039 osize += 4; /* \xHH */
12040 else if (ch < 0x10000)
12041 osize += 6; /* \uHHHH */
12042 else
12043 osize += 10; /* \uHHHHHHHH */
12044 }
12045 }
12046
12047 quote = '\'';
12048 if (squote) {
12049 if (dquote)
12050 /* Both squote and dquote present. Use squote,
12051 and escape them */
12052 osize += squote;
12053 else
12054 quote = '"';
12055 }
12056
12057 repr = PyUnicode_New(osize, max);
12058 if (repr == NULL)
12059 return NULL;
12060 okind = PyUnicode_KIND(repr);
12061 odata = PyUnicode_DATA(repr);
12062
12063 PyUnicode_WRITE(okind, odata, 0, quote);
12064 PyUnicode_WRITE(okind, odata, osize-1, quote);
12065
12066 for (i = 0, o = 1; i < isize; i++) {
12067 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012068
12069 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012070 if ((ch == quote) || (ch == '\\')) {
12071 PyUnicode_WRITE(okind, odata, o++, '\\');
12072 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012073 continue;
12074 }
12075
Benjamin Peterson29060642009-01-31 22:14:21 +000012076 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012077 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 PyUnicode_WRITE(okind, odata, o++, '\\');
12079 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012080 }
12081 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012082 PyUnicode_WRITE(okind, odata, o++, '\\');
12083 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012084 }
12085 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012086 PyUnicode_WRITE(okind, odata, o++, '\\');
12087 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012088 }
12089
12090 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012091 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 PyUnicode_WRITE(okind, odata, o++, '\\');
12093 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012094 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12095 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012096 }
12097
Georg Brandl559e5d72008-06-11 18:37:52 +000012098 /* Copy ASCII characters as-is */
12099 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012101 }
12102
Benjamin Peterson29060642009-01-31 22:14:21 +000012103 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012104 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012105 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012106 (categories Z* and C* except ASCII space)
12107 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012109 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 if (ch <= 0xff) {
12111 PyUnicode_WRITE(okind, odata, o++, '\\');
12112 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012113 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12114 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012115 }
12116 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012117 else if (ch >= 0x10000) {
12118 PyUnicode_WRITE(okind, odata, o++, '\\');
12119 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012120 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12121 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12122 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12123 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12124 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12125 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12126 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12127 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012128 }
12129 /* Map 16-bit characters to '\uxxxx' */
12130 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131 PyUnicode_WRITE(okind, odata, o++, '\\');
12132 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012133 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12134 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12135 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12136 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012137 }
12138 }
12139 /* Copy characters as-is */
12140 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012142 }
12143 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012144 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012146 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012147 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148}
12149
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012150PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012151 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012152\n\
12153Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012154such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155arguments start and end are interpreted as in slice notation.\n\
12156\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012157Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012158
12159static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012160unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012161{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012162 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012163 Py_ssize_t start;
12164 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012165 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012166
Jesus Ceaac451502011-04-20 17:09:23 +020012167 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12168 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012169 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012171 if (PyUnicode_READY(self) == -1)
12172 return NULL;
12173 if (PyUnicode_READY(substring) == -1)
12174 return NULL;
12175
Victor Stinner7931d9a2011-11-04 00:22:48 +010012176 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177
12178 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012180 if (result == -2)
12181 return NULL;
12182
Christian Heimes217cfd12007-12-02 14:31:20 +000012183 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184}
12185
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012186PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012187 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012188\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012189Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012190
12191static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012192unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012194 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012195 Py_ssize_t start;
12196 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012197 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198
Jesus Ceaac451502011-04-20 17:09:23 +020012199 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12200 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012201 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012203 if (PyUnicode_READY(self) == -1)
12204 return NULL;
12205 if (PyUnicode_READY(substring) == -1)
12206 return NULL;
12207
Victor Stinner7931d9a2011-11-04 00:22:48 +010012208 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209
12210 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212 if (result == -2)
12213 return NULL;
12214
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215 if (result < 0) {
12216 PyErr_SetString(PyExc_ValueError, "substring not found");
12217 return NULL;
12218 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219
Christian Heimes217cfd12007-12-02 14:31:20 +000012220 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221}
12222
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012223PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012224 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012225\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012226Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012227done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012228
12229static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012230unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012232 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012233 Py_UCS4 fillchar = ' ';
12234
Victor Stinnere9a29352011-10-01 02:14:59 +020012235 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012236 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012237
Victor Stinnere9a29352011-10-01 02:14:59 +020012238 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012239 return NULL;
12240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012241 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012243 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244 }
12245
Victor Stinner7931d9a2011-11-04 00:22:48 +010012246 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247}
12248
Alexander Belopolsky40018472011-02-26 01:02:56 +000012249PyObject *
12250PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251{
12252 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012253
Guido van Rossumd57fd912000-03-10 22:53:23 +000012254 s = PyUnicode_FromObject(s);
12255 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012256 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012257 if (sep != NULL) {
12258 sep = PyUnicode_FromObject(sep);
12259 if (sep == NULL) {
12260 Py_DECREF(s);
12261 return NULL;
12262 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263 }
12264
Victor Stinner9310abb2011-10-05 00:59:23 +020012265 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266
12267 Py_DECREF(s);
12268 Py_XDECREF(sep);
12269 return result;
12270}
12271
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012272PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012273 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274\n\
12275Return a list of the words in S, using sep as the\n\
12276delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012277splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012278whitespace string is a separator and empty strings are\n\
12279removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280
12281static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012282unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283{
12284 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012285 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012286
Martin v. Löwis18e16552006-02-15 17:27:45 +000012287 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012288 return NULL;
12289
12290 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012291 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012292 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012293 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012294 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012295 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296}
12297
Thomas Wouters477c8d52006-05-27 19:21:47 +000012298PyObject *
12299PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12300{
12301 PyObject* str_obj;
12302 PyObject* sep_obj;
12303 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 int kind1, kind2, kind;
12305 void *buf1 = NULL, *buf2 = NULL;
12306 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012307
12308 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012309 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012310 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012311 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012313 Py_DECREF(str_obj);
12314 return NULL;
12315 }
12316
Victor Stinner14f8f022011-10-05 20:58:25 +020012317 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012319 kind = Py_MAX(kind1, kind2);
12320 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012321 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012322 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 if (!buf1)
12324 goto onError;
12325 buf2 = PyUnicode_DATA(sep_obj);
12326 if (kind2 != kind)
12327 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12328 if (!buf2)
12329 goto onError;
12330 len1 = PyUnicode_GET_LENGTH(str_obj);
12331 len2 = PyUnicode_GET_LENGTH(sep_obj);
12332
Victor Stinner14f8f022011-10-05 20:58:25 +020012333 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012335 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12336 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12337 else
12338 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012339 break;
12340 case PyUnicode_2BYTE_KIND:
12341 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12342 break;
12343 case PyUnicode_4BYTE_KIND:
12344 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12345 break;
12346 default:
12347 assert(0);
12348 out = 0;
12349 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012350
12351 Py_DECREF(sep_obj);
12352 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012353 if (kind1 != kind)
12354 PyMem_Free(buf1);
12355 if (kind2 != kind)
12356 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012357
12358 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359 onError:
12360 Py_DECREF(sep_obj);
12361 Py_DECREF(str_obj);
12362 if (kind1 != kind && buf1)
12363 PyMem_Free(buf1);
12364 if (kind2 != kind && buf2)
12365 PyMem_Free(buf2);
12366 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012367}
12368
12369
12370PyObject *
12371PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12372{
12373 PyObject* str_obj;
12374 PyObject* sep_obj;
12375 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376 int kind1, kind2, kind;
12377 void *buf1 = NULL, *buf2 = NULL;
12378 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012379
12380 str_obj = PyUnicode_FromObject(str_in);
12381 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012382 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012383 sep_obj = PyUnicode_FromObject(sep_in);
12384 if (!sep_obj) {
12385 Py_DECREF(str_obj);
12386 return NULL;
12387 }
12388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012389 kind1 = PyUnicode_KIND(str_in);
12390 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012391 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012392 buf1 = PyUnicode_DATA(str_in);
12393 if (kind1 != kind)
12394 buf1 = _PyUnicode_AsKind(str_in, kind);
12395 if (!buf1)
12396 goto onError;
12397 buf2 = PyUnicode_DATA(sep_obj);
12398 if (kind2 != kind)
12399 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12400 if (!buf2)
12401 goto onError;
12402 len1 = PyUnicode_GET_LENGTH(str_obj);
12403 len2 = PyUnicode_GET_LENGTH(sep_obj);
12404
12405 switch(PyUnicode_KIND(str_in)) {
12406 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012407 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12408 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12409 else
12410 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411 break;
12412 case PyUnicode_2BYTE_KIND:
12413 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12414 break;
12415 case PyUnicode_4BYTE_KIND:
12416 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12417 break;
12418 default:
12419 assert(0);
12420 out = 0;
12421 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012422
12423 Py_DECREF(sep_obj);
12424 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012425 if (kind1 != kind)
12426 PyMem_Free(buf1);
12427 if (kind2 != kind)
12428 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012429
12430 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012431 onError:
12432 Py_DECREF(sep_obj);
12433 Py_DECREF(str_obj);
12434 if (kind1 != kind && buf1)
12435 PyMem_Free(buf1);
12436 if (kind2 != kind && buf2)
12437 PyMem_Free(buf2);
12438 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012439}
12440
12441PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012442 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012443\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012444Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012445the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012446found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012447
12448static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012449unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012450{
Victor Stinner9310abb2011-10-05 00:59:23 +020012451 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012452}
12453
12454PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012455 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012456\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012457Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012458the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012459separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012460
12461static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012462unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012463{
Victor Stinner9310abb2011-10-05 00:59:23 +020012464 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012465}
12466
Alexander Belopolsky40018472011-02-26 01:02:56 +000012467PyObject *
12468PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012469{
12470 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012471
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012472 s = PyUnicode_FromObject(s);
12473 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012474 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012475 if (sep != NULL) {
12476 sep = PyUnicode_FromObject(sep);
12477 if (sep == NULL) {
12478 Py_DECREF(s);
12479 return NULL;
12480 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012481 }
12482
Victor Stinner9310abb2011-10-05 00:59:23 +020012483 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012484
12485 Py_DECREF(s);
12486 Py_XDECREF(sep);
12487 return result;
12488}
12489
12490PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012491 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012492\n\
12493Return a list of the words in S, using sep as the\n\
12494delimiter string, starting at the end of the string and\n\
12495working to the front. If maxsplit is given, at most maxsplit\n\
12496splits are done. If sep is not specified, any whitespace string\n\
12497is a separator.");
12498
12499static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012500unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012501{
12502 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012503 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012504
Martin v. Löwis18e16552006-02-15 17:27:45 +000012505 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012506 return NULL;
12507
12508 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012509 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012510 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012511 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012512 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012513 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012514}
12515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012516PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012517 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518\n\
12519Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012520Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012521is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522
12523static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012524unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012526 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012527 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012529 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12530 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531 return NULL;
12532
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012533 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012534}
12535
12536static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012537PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012538{
Walter Dörwald346737f2007-05-31 10:44:43 +000012539 if (PyUnicode_CheckExact(self)) {
12540 Py_INCREF(self);
12541 return self;
12542 } else
12543 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012544 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545}
12546
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012547PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012548 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549\n\
12550Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012551and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012552
12553static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012554unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556 return fixup(self, fixswapcase);
12557}
12558
Georg Brandlceee0772007-11-27 23:48:05 +000012559PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012560 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012561\n\
12562Return a translation table usable for str.translate().\n\
12563If there is only one argument, it must be a dictionary mapping Unicode\n\
12564ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012565Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012566If there are two arguments, they must be strings of equal length, and\n\
12567in the resulting dictionary, each character in x will be mapped to the\n\
12568character at the same position in y. If there is a third argument, it\n\
12569must be a string, whose characters will be mapped to None in the result.");
12570
12571static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012572unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012573{
12574 PyObject *x, *y = NULL, *z = NULL;
12575 PyObject *new = NULL, *key, *value;
12576 Py_ssize_t i = 0;
12577 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012578
Georg Brandlceee0772007-11-27 23:48:05 +000012579 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12580 return NULL;
12581 new = PyDict_New();
12582 if (!new)
12583 return NULL;
12584 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012585 int x_kind, y_kind, z_kind;
12586 void *x_data, *y_data, *z_data;
12587
Georg Brandlceee0772007-11-27 23:48:05 +000012588 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012589 if (!PyUnicode_Check(x)) {
12590 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12591 "be a string if there is a second argument");
12592 goto err;
12593 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012594 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012595 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12596 "arguments must have equal length");
12597 goto err;
12598 }
12599 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012600 x_kind = PyUnicode_KIND(x);
12601 y_kind = PyUnicode_KIND(y);
12602 x_data = PyUnicode_DATA(x);
12603 y_data = PyUnicode_DATA(y);
12604 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12605 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12606 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012607 if (!key || !value)
12608 goto err;
12609 res = PyDict_SetItem(new, key, value);
12610 Py_DECREF(key);
12611 Py_DECREF(value);
12612 if (res < 0)
12613 goto err;
12614 }
12615 /* create entries for deleting chars in z */
12616 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012617 z_kind = PyUnicode_KIND(z);
12618 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012619 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012620 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012621 if (!key)
12622 goto err;
12623 res = PyDict_SetItem(new, key, Py_None);
12624 Py_DECREF(key);
12625 if (res < 0)
12626 goto err;
12627 }
12628 }
12629 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012630 int kind;
12631 void *data;
12632
Georg Brandlceee0772007-11-27 23:48:05 +000012633 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012634 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012635 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12636 "to maketrans it must be a dict");
12637 goto err;
12638 }
12639 /* copy entries into the new dict, converting string keys to int keys */
12640 while (PyDict_Next(x, &i, &key, &value)) {
12641 if (PyUnicode_Check(key)) {
12642 /* convert string keys to integer keys */
12643 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012644 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012645 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12646 "table must be of length 1");
12647 goto err;
12648 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649 kind = PyUnicode_KIND(key);
12650 data = PyUnicode_DATA(key);
12651 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012652 if (!newkey)
12653 goto err;
12654 res = PyDict_SetItem(new, newkey, value);
12655 Py_DECREF(newkey);
12656 if (res < 0)
12657 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012658 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012659 /* just keep integer keys */
12660 if (PyDict_SetItem(new, key, value) < 0)
12661 goto err;
12662 } else {
12663 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12664 "be strings or integers");
12665 goto err;
12666 }
12667 }
12668 }
12669 return new;
12670 err:
12671 Py_DECREF(new);
12672 return NULL;
12673}
12674
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012675PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012676 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012677\n\
12678Return a copy of the string S, where all characters have been mapped\n\
12679through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012680Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012681Unmapped characters are left untouched. Characters mapped to None\n\
12682are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012683
12684static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012685unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012686{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012687 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012688}
12689
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012690PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012691 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012692\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012693Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694
12695static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012696unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698 return fixup(self, fixupper);
12699}
12700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012701PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012702 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012703\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012704Pad a numeric string S with zeros on the left, to fill a field\n\
12705of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706
12707static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012708unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012709{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012710 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012711 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012712 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012713 int kind;
12714 void *data;
12715 Py_UCS4 chr;
12716
12717 if (PyUnicode_READY(self) == -1)
12718 return NULL;
12719
Martin v. Löwis18e16552006-02-15 17:27:45 +000012720 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012721 return NULL;
12722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012723 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012724 if (PyUnicode_CheckExact(self)) {
12725 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012726 return self;
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012727 }
12728 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012729 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012730 }
12731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012733
12734 u = pad(self, fill, 0, '0');
12735
Walter Dörwald068325e2002-04-15 13:36:47 +000012736 if (u == NULL)
12737 return NULL;
12738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739 kind = PyUnicode_KIND(u);
12740 data = PyUnicode_DATA(u);
12741 chr = PyUnicode_READ(kind, data, fill);
12742
12743 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012744 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012745 PyUnicode_WRITE(kind, data, 0, chr);
12746 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012747 }
12748
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012749 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012750 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012751}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012752
#if 0
/* Compiled out.  Thin wrapper that returns the result of
 * PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
12760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012761PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012762 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012763\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012764Return True if S starts with the specified prefix, False otherwise.\n\
12765With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012766With optional end, stop comparing S at that position.\n\
12767prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012768
12769static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012770unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012771 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012772{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012773 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012774 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012775 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012776 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012777 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012778
Jesus Ceaac451502011-04-20 17:09:23 +020012779 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012780 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012781 if (PyTuple_Check(subobj)) {
12782 Py_ssize_t i;
12783 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012784 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012785 if (substring == NULL)
12786 return NULL;
12787 result = tailmatch(self, substring, start, end, -1);
12788 Py_DECREF(substring);
12789 if (result) {
12790 Py_RETURN_TRUE;
12791 }
12792 }
12793 /* nothing matched */
12794 Py_RETURN_FALSE;
12795 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012796 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012797 if (substring == NULL) {
12798 if (PyErr_ExceptionMatches(PyExc_TypeError))
12799 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12800 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012801 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012802 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012803 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012804 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012805 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012806}
12807
12808
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012809PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012810 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012811\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012812Return True if S ends with the specified suffix, False otherwise.\n\
12813With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012814With optional end, stop comparing S at that position.\n\
12815suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816
12817static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012818unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012819 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012820{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012821 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012822 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012823 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012824 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012825 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012826
Jesus Ceaac451502011-04-20 17:09:23 +020012827 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012828 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012829 if (PyTuple_Check(subobj)) {
12830 Py_ssize_t i;
12831 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012832 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012833 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012834 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012835 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012836 result = tailmatch(self, substring, start, end, +1);
12837 Py_DECREF(substring);
12838 if (result) {
12839 Py_RETURN_TRUE;
12840 }
12841 }
12842 Py_RETURN_FALSE;
12843 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012844 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012845 if (substring == NULL) {
12846 if (PyErr_ExceptionMatches(PyExc_TypeError))
12847 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12848 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012849 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012850 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012851 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012852 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012853 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012854}
12855
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012856#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012857
12858PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012859 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012860\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012861Return a formatted version of S, using substitutions from args and kwargs.\n\
12862The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012863
Eric Smith27bbca62010-11-04 17:06:58 +000012864PyDoc_STRVAR(format_map__doc__,
12865 "S.format_map(mapping) -> str\n\
12866\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012867Return a formatted version of S, using substitutions from mapping.\n\
12868The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012869
Eric Smith4a7d76d2008-05-30 18:10:19 +000012870static PyObject *
12871unicode__format__(PyObject* self, PyObject* args)
12872{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012873 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012874
12875 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12876 return NULL;
12877
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012878 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012879 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012880 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012881}
12882
Eric Smith8c663262007-08-25 02:26:07 +000012883PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012884 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012885\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012886Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012887
12888static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012889unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012890{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012891 Py_ssize_t size;
12892
12893 /* If it's a compact object, account for base structure +
12894 character data. */
12895 if (PyUnicode_IS_COMPACT_ASCII(v))
12896 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12897 else if (PyUnicode_IS_COMPACT(v))
12898 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012899 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012900 else {
12901 /* If it is a two-block object, account for base object, and
12902 for character block if present. */
12903 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012904 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012905 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012906 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012907 }
12908 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012909 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012910 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012911 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012912 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012913 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012914
12915 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012916}
12917
12918PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012919 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012920
12921static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012922unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012923{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012924 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012925 if (!copy)
12926 return NULL;
12927 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012928}
12929
Guido van Rossumd57fd912000-03-10 22:53:23 +000012930static PyMethodDef unicode_methods[] = {
12931
12932 /* Order is according to common usage: often used methods should
12933 appear first, since lookup is done sequentially. */
12934
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012935 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012936 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12937 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012938 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012939 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12940 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12941 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12942 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12943 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12944 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12945 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012946 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012947 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12948 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12949 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012950 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012951 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12952 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12953 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012954 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012955 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012956 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012957 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012958 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12959 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12960 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12961 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12962 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12963 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12964 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12965 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12966 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12967 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12968 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12969 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12970 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12971 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012972 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012973 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012974 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012975 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012976 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012977 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012978 {"maketrans", (PyCFunction) unicode_maketrans,
12979 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012980 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012981#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012982 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012983#endif
12984
12985#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012986 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012987 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012988#endif
12989
Benjamin Peterson14339b62009-01-31 16:36:08 +000012990 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012991 {NULL, NULL}
12992};
12993
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012994static PyObject *
12995unicode_mod(PyObject *v, PyObject *w)
12996{
Brian Curtindfc80e32011-08-10 20:28:54 -050012997 if (!PyUnicode_Check(v))
12998 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012999 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013000}
13001
13002static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013003 0, /*nb_add*/
13004 0, /*nb_subtract*/
13005 0, /*nb_multiply*/
13006 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013007};
13008
Guido van Rossumd57fd912000-03-10 22:53:23 +000013009static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013010 (lenfunc) unicode_length, /* sq_length */
13011 PyUnicode_Concat, /* sq_concat */
13012 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13013 (ssizeargfunc) unicode_getitem, /* sq_item */
13014 0, /* sq_slice */
13015 0, /* sq_ass_item */
13016 0, /* sq_ass_slice */
13017 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013018};
13019
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013020static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013021unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013022{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013023 if (PyUnicode_READY(self) == -1)
13024 return NULL;
13025
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013026 if (PyIndex_Check(item)) {
13027 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013028 if (i == -1 && PyErr_Occurred())
13029 return NULL;
13030 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013031 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013032 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013033 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013034 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013035 PyObject *result;
13036 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013037 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013038 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013040 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013041 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013042 return NULL;
13043 }
13044
13045 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013046 return PyUnicode_New(0, 0);
13047 } else if (start == 0 && step == 1 &&
13048 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000013049 PyUnicode_CheckExact(self)) {
13050 Py_INCREF(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013051 return self;
Thomas Woutersed03b412007-08-28 21:37:11 +000013052 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013053 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013054 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013055 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013056 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013057 src_kind = PyUnicode_KIND(self);
13058 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013059 if (!PyUnicode_IS_ASCII(self)) {
13060 kind_limit = kind_maxchar_limit(src_kind);
13061 max_char = 0;
13062 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13063 ch = PyUnicode_READ(src_kind, src_data, cur);
13064 if (ch > max_char) {
13065 max_char = ch;
13066 if (max_char >= kind_limit)
13067 break;
13068 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013069 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013070 }
Victor Stinner55c99112011-10-13 01:17:06 +020013071 else
13072 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013073 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013074 if (result == NULL)
13075 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013076 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013077 dest_data = PyUnicode_DATA(result);
13078
13079 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013080 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13081 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013082 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013083 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013084 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013085 } else {
13086 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13087 return NULL;
13088 }
13089}
13090
13091static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013092 (lenfunc)unicode_length, /* mp_length */
13093 (binaryfunc)unicode_subscript, /* mp_subscript */
13094 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013095};
13096
Guido van Rossumd57fd912000-03-10 22:53:23 +000013097
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098/* Helpers for PyUnicode_Format() */
13099
13100static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013101getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013103 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013104 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013105 (*p_argidx)++;
13106 if (arglen < 0)
13107 return args;
13108 else
13109 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110 }
13111 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013112 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113 return NULL;
13114}
13115
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013116/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013118static PyObject *
13119formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013121 char *p;
13122 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013123 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013124
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125 x = PyFloat_AsDouble(v);
13126 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013127 return NULL;
13128
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013130 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013131
Eric Smith0923d1d2009-04-16 20:16:10 +000013132 p = PyOS_double_to_string(x, type, prec,
13133 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013134 if (p == NULL)
13135 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013136 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013137 PyMem_Free(p);
13138 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139}
13140
Tim Peters38fd5b62000-09-21 05:43:11 +000013141static PyObject*
13142formatlong(PyObject *val, int flags, int prec, int type)
13143{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013144 char *buf;
13145 int len;
13146 PyObject *str; /* temporary string object. */
13147 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013148
Benjamin Peterson14339b62009-01-31 16:36:08 +000013149 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13150 if (!str)
13151 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013152 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013153 Py_DECREF(str);
13154 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013155}
13156
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013157static Py_UCS4
13158formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013159{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013160 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013161 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013162 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013163 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013164 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013165 goto onError;
13166 }
13167 else {
13168 /* Integer input truncated to a character */
13169 long x;
13170 x = PyLong_AsLong(v);
13171 if (x == -1 && PyErr_Occurred())
13172 goto onError;
13173
13174 if (x < 0 || x > 0x10ffff) {
13175 PyErr_SetString(PyExc_OverflowError,
13176 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013177 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013178 }
13179
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013180 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013181 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013182
Benjamin Peterson29060642009-01-31 22:14:21 +000013183 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013184 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013185 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013186 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013187}
13188
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013189static int
13190repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13191{
13192 int r;
13193 assert(count > 0);
13194 assert(PyUnicode_Check(obj));
13195 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013196 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013197 if (repeated == NULL)
13198 return -1;
13199 r = _PyAccu_Accumulate(acc, repeated);
13200 Py_DECREF(repeated);
13201 return r;
13202 }
13203 else {
13204 do {
13205 if (_PyAccu_Accumulate(acc, obj))
13206 return -1;
13207 } while (--count);
13208 return 0;
13209 }
13210}
13211
Alexander Belopolsky40018472011-02-26 01:02:56 +000013212PyObject *
13213PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013214{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013215 void *fmt;
13216 int fmtkind;
13217 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013218 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013219 int r;
13220 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013221 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013222 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013223 PyObject *temp = NULL;
13224 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013225 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013226 _PyAccu acc;
13227 static PyObject *plus, *minus, *blank, *zero, *percent;
13228
13229 if (!plus && !(plus = get_latin1_char('+')))
13230 return NULL;
13231 if (!minus && !(minus = get_latin1_char('-')))
13232 return NULL;
13233 if (!blank && !(blank = get_latin1_char(' ')))
13234 return NULL;
13235 if (!zero && !(zero = get_latin1_char('0')))
13236 return NULL;
13237 if (!percent && !(percent = get_latin1_char('%')))
13238 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013239
Guido van Rossumd57fd912000-03-10 22:53:23 +000013240 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013241 PyErr_BadInternalCall();
13242 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013243 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013244 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013245 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013246 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013247 if (_PyAccu_Init(&acc))
13248 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013249 fmt = PyUnicode_DATA(uformat);
13250 fmtkind = PyUnicode_KIND(uformat);
13251 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13252 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013253
Guido van Rossumd57fd912000-03-10 22:53:23 +000013254 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013255 arglen = PyTuple_Size(args);
13256 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013257 }
13258 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013259 arglen = -1;
13260 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013261 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013262 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013263 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013264 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013265
13266 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013267 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013268 PyObject *nonfmt;
13269 Py_ssize_t nonfmtpos;
13270 nonfmtpos = fmtpos++;
13271 while (fmtcnt >= 0 &&
13272 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13273 fmtpos++;
13274 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013275 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013276 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013277 if (nonfmt == NULL)
13278 goto onError;
13279 r = _PyAccu_Accumulate(&acc, nonfmt);
13280 Py_DECREF(nonfmt);
13281 if (r)
13282 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013283 }
13284 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013285 /* Got a format specifier */
13286 int flags = 0;
13287 Py_ssize_t width = -1;
13288 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013289 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013290 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013291 int isnumok;
13292 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013293 void *pbuf = NULL;
13294 Py_ssize_t pindex, len;
13295 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013297 fmtpos++;
13298 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13299 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013300 Py_ssize_t keylen;
13301 PyObject *key;
13302 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013303
Benjamin Peterson29060642009-01-31 22:14:21 +000013304 if (dict == NULL) {
13305 PyErr_SetString(PyExc_TypeError,
13306 "format requires a mapping");
13307 goto onError;
13308 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013309 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013310 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013311 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013312 /* Skip over balanced parentheses */
13313 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013314 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013315 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013316 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013317 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013318 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013319 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013320 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013321 if (fmtcnt < 0 || pcount > 0) {
13322 PyErr_SetString(PyExc_ValueError,
13323 "incomplete format key");
13324 goto onError;
13325 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013326 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013327 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013328 if (key == NULL)
13329 goto onError;
13330 if (args_owned) {
13331 Py_DECREF(args);
13332 args_owned = 0;
13333 }
13334 args = PyObject_GetItem(dict, key);
13335 Py_DECREF(key);
13336 if (args == NULL) {
13337 goto onError;
13338 }
13339 args_owned = 1;
13340 arglen = -1;
13341 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013342 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013343 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013344 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013345 case '-': flags |= F_LJUST; continue;
13346 case '+': flags |= F_SIGN; continue;
13347 case ' ': flags |= F_BLANK; continue;
13348 case '#': flags |= F_ALT; continue;
13349 case '0': flags |= F_ZERO; continue;
13350 }
13351 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013352 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013353 if (c == '*') {
13354 v = getnextarg(args, arglen, &argidx);
13355 if (v == NULL)
13356 goto onError;
13357 if (!PyLong_Check(v)) {
13358 PyErr_SetString(PyExc_TypeError,
13359 "* wants int");
13360 goto onError;
13361 }
13362 width = PyLong_AsLong(v);
13363 if (width == -1 && PyErr_Occurred())
13364 goto onError;
13365 if (width < 0) {
13366 flags |= F_LJUST;
13367 width = -width;
13368 }
13369 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013370 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013371 }
13372 else if (c >= '0' && c <= '9') {
13373 width = c - '0';
13374 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013375 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013376 if (c < '0' || c > '9')
13377 break;
13378 if ((width*10) / 10 != width) {
13379 PyErr_SetString(PyExc_ValueError,
13380 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013381 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013382 }
13383 width = width*10 + (c - '0');
13384 }
13385 }
13386 if (c == '.') {
13387 prec = 0;
13388 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013389 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013390 if (c == '*') {
13391 v = getnextarg(args, arglen, &argidx);
13392 if (v == NULL)
13393 goto onError;
13394 if (!PyLong_Check(v)) {
13395 PyErr_SetString(PyExc_TypeError,
13396 "* wants int");
13397 goto onError;
13398 }
13399 prec = PyLong_AsLong(v);
13400 if (prec == -1 && PyErr_Occurred())
13401 goto onError;
13402 if (prec < 0)
13403 prec = 0;
13404 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013405 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013406 }
13407 else if (c >= '0' && c <= '9') {
13408 prec = c - '0';
13409 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013410 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013411 if (c < '0' || c > '9')
13412 break;
13413 if ((prec*10) / 10 != prec) {
13414 PyErr_SetString(PyExc_ValueError,
13415 "prec too big");
13416 goto onError;
13417 }
13418 prec = prec*10 + (c - '0');
13419 }
13420 }
13421 } /* prec */
13422 if (fmtcnt >= 0) {
13423 if (c == 'h' || c == 'l' || c == 'L') {
13424 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013425 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013426 }
13427 }
13428 if (fmtcnt < 0) {
13429 PyErr_SetString(PyExc_ValueError,
13430 "incomplete format");
13431 goto onError;
13432 }
13433 if (c != '%') {
13434 v = getnextarg(args, arglen, &argidx);
13435 if (v == NULL)
13436 goto onError;
13437 }
13438 sign = 0;
13439 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013440 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013441 switch (c) {
13442
13443 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013444 _PyAccu_Accumulate(&acc, percent);
13445 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013446
13447 case 's':
13448 case 'r':
13449 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013450 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013451 temp = v;
13452 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013453 }
13454 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013455 if (c == 's')
13456 temp = PyObject_Str(v);
13457 else if (c == 'r')
13458 temp = PyObject_Repr(v);
13459 else
13460 temp = PyObject_ASCII(v);
13461 if (temp == NULL)
13462 goto onError;
13463 if (PyUnicode_Check(temp))
13464 /* nothing to do */;
13465 else {
13466 Py_DECREF(temp);
13467 PyErr_SetString(PyExc_TypeError,
13468 "%s argument has non-string str()");
13469 goto onError;
13470 }
13471 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013472 if (PyUnicode_READY(temp) == -1) {
13473 Py_CLEAR(temp);
13474 goto onError;
13475 }
13476 pbuf = PyUnicode_DATA(temp);
13477 kind = PyUnicode_KIND(temp);
13478 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013479 if (prec >= 0 && len > prec)
13480 len = prec;
13481 break;
13482
13483 case 'i':
13484 case 'd':
13485 case 'u':
13486 case 'o':
13487 case 'x':
13488 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013489 isnumok = 0;
13490 if (PyNumber_Check(v)) {
13491 PyObject *iobj=NULL;
13492
13493 if (PyLong_Check(v)) {
13494 iobj = v;
13495 Py_INCREF(iobj);
13496 }
13497 else {
13498 iobj = PyNumber_Long(v);
13499 }
13500 if (iobj!=NULL) {
13501 if (PyLong_Check(iobj)) {
13502 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013503 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013504 Py_DECREF(iobj);
13505 if (!temp)
13506 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013507 if (PyUnicode_READY(temp) == -1) {
13508 Py_CLEAR(temp);
13509 goto onError;
13510 }
13511 pbuf = PyUnicode_DATA(temp);
13512 kind = PyUnicode_KIND(temp);
13513 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013514 sign = 1;
13515 }
13516 else {
13517 Py_DECREF(iobj);
13518 }
13519 }
13520 }
13521 if (!isnumok) {
13522 PyErr_Format(PyExc_TypeError,
13523 "%%%c format: a number is required, "
13524 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13525 goto onError;
13526 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013527 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013528 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013529 fillobj = zero;
13530 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013531 break;
13532
13533 case 'e':
13534 case 'E':
13535 case 'f':
13536 case 'F':
13537 case 'g':
13538 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013539 temp = formatfloat(v, flags, prec, c);
13540 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013541 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013542 if (PyUnicode_READY(temp) == -1) {
13543 Py_CLEAR(temp);
13544 goto onError;
13545 }
13546 pbuf = PyUnicode_DATA(temp);
13547 kind = PyUnicode_KIND(temp);
13548 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013549 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013550 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013551 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013552 fillobj = zero;
13553 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013554 break;
13555
13556 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013557 {
13558 Py_UCS4 ch = formatchar(v);
13559 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013560 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013561 temp = _PyUnicode_FromUCS4(&ch, 1);
13562 if (temp == NULL)
13563 goto onError;
13564 pbuf = PyUnicode_DATA(temp);
13565 kind = PyUnicode_KIND(temp);
13566 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013567 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013568 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013569
13570 default:
13571 PyErr_Format(PyExc_ValueError,
13572 "unsupported format character '%c' (0x%x) "
13573 "at index %zd",
13574 (31<=c && c<=126) ? (char)c : '?',
13575 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013576 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013577 goto onError;
13578 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013579 /* pbuf is initialized here. */
13580 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013581 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013582 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13583 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013584 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013585 pindex++;
13586 }
13587 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13588 signobj = plus;
13589 len--;
13590 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013591 }
13592 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013593 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013594 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013595 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013596 else
13597 sign = 0;
13598 }
13599 if (width < len)
13600 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013601 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013602 if (fill != ' ') {
13603 assert(signobj != NULL);
13604 if (_PyAccu_Accumulate(&acc, signobj))
13605 goto onError;
13606 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013607 if (width > len)
13608 width--;
13609 }
13610 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013611 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013612 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013613 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013614 second = get_latin1_char(
13615 PyUnicode_READ(kind, pbuf, pindex + 1));
13616 pindex += 2;
13617 if (second == NULL ||
13618 _PyAccu_Accumulate(&acc, zero) ||
13619 _PyAccu_Accumulate(&acc, second))
13620 goto onError;
13621 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013622 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013623 width -= 2;
13624 if (width < 0)
13625 width = 0;
13626 len -= 2;
13627 }
13628 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013629 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013630 if (repeat_accumulate(&acc, fillobj, width - len))
13631 goto onError;
13632 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013633 }
13634 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013635 if (sign) {
13636 assert(signobj != NULL);
13637 if (_PyAccu_Accumulate(&acc, signobj))
13638 goto onError;
13639 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013640 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013641 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13642 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013643 second = get_latin1_char(
13644 PyUnicode_READ(kind, pbuf, pindex + 1));
13645 pindex += 2;
13646 if (second == NULL ||
13647 _PyAccu_Accumulate(&acc, zero) ||
13648 _PyAccu_Accumulate(&acc, second))
13649 goto onError;
13650 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013651 }
13652 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013653 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013654 if (temp != NULL) {
13655 assert(pbuf == PyUnicode_DATA(temp));
13656 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013657 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013658 else {
13659 const char *p = (const char *) pbuf;
13660 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013661 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013662 v = PyUnicode_FromKindAndData(kind, p, len);
13663 }
13664 if (v == NULL)
13665 goto onError;
13666 r = _PyAccu_Accumulate(&acc, v);
13667 Py_DECREF(v);
13668 if (r)
13669 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013670 if (width > len && repeat_accumulate(&acc, blank, width - len))
13671 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013672 if (dict && (argidx < arglen) && c != '%') {
13673 PyErr_SetString(PyExc_TypeError,
13674 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013675 goto onError;
13676 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013677 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013678 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013679 } /* until end */
13680 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013681 PyErr_SetString(PyExc_TypeError,
13682 "not all arguments converted during string formatting");
13683 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013684 }
13685
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013686 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013687 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013688 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013689 }
13690 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013691 Py_XDECREF(temp);
13692 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013693 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013694
Benjamin Peterson29060642009-01-31 22:14:21 +000013695 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013696 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013697 Py_XDECREF(temp);
13698 Py_XDECREF(second);
13699 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013700 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013701 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013702 }
13703 return NULL;
13704}
13705
Jeremy Hylton938ace62002-07-17 16:30:39 +000013706static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013707unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13708
Tim Peters6d6c1a32001-08-02 04:15:00 +000013709static PyObject *
13710unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13711{
Benjamin Peterson29060642009-01-31 22:14:21 +000013712 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013713 static char *kwlist[] = {"object", "encoding", "errors", 0};
13714 char *encoding = NULL;
13715 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013716
Benjamin Peterson14339b62009-01-31 16:36:08 +000013717 if (type != &PyUnicode_Type)
13718 return unicode_subtype_new(type, args, kwds);
13719 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013720 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013721 return NULL;
13722 if (x == NULL)
Victor Stinner7931d9a2011-11-04 00:22:48 +010013723 return PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013724 if (encoding == NULL && errors == NULL)
13725 return PyObject_Str(x);
13726 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013727 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013728}
13729
Guido van Rossume023fe02001-08-30 03:12:59 +000013730static PyObject *
13731unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13732{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013733 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013734 Py_ssize_t length, char_size;
13735 int share_wstr, share_utf8;
13736 unsigned int kind;
13737 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013738
Benjamin Peterson14339b62009-01-31 16:36:08 +000013739 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013740
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013741 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013742 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013743 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013744 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013745 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013746 return NULL;
13747
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013748 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013749 if (self == NULL) {
13750 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013751 return NULL;
13752 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013753 kind = PyUnicode_KIND(unicode);
13754 length = PyUnicode_GET_LENGTH(unicode);
13755
13756 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013757#ifdef Py_DEBUG
13758 _PyUnicode_HASH(self) = -1;
13759#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013760 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013761#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013762 _PyUnicode_STATE(self).interned = 0;
13763 _PyUnicode_STATE(self).kind = kind;
13764 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013765 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013766 _PyUnicode_STATE(self).ready = 1;
13767 _PyUnicode_WSTR(self) = NULL;
13768 _PyUnicode_UTF8_LENGTH(self) = 0;
13769 _PyUnicode_UTF8(self) = NULL;
13770 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013771 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013772
13773 share_utf8 = 0;
13774 share_wstr = 0;
13775 if (kind == PyUnicode_1BYTE_KIND) {
13776 char_size = 1;
13777 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13778 share_utf8 = 1;
13779 }
13780 else if (kind == PyUnicode_2BYTE_KIND) {
13781 char_size = 2;
13782 if (sizeof(wchar_t) == 2)
13783 share_wstr = 1;
13784 }
13785 else {
13786 assert(kind == PyUnicode_4BYTE_KIND);
13787 char_size = 4;
13788 if (sizeof(wchar_t) == 4)
13789 share_wstr = 1;
13790 }
13791
13792 /* Ensure we won't overflow the length. */
13793 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13794 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013795 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013796 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013797 data = PyObject_MALLOC((length + 1) * char_size);
13798 if (data == NULL) {
13799 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013800 goto onError;
13801 }
13802
Victor Stinnerc3c74152011-10-02 20:39:55 +020013803 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013804 if (share_utf8) {
13805 _PyUnicode_UTF8_LENGTH(self) = length;
13806 _PyUnicode_UTF8(self) = data;
13807 }
13808 if (share_wstr) {
13809 _PyUnicode_WSTR_LENGTH(self) = length;
13810 _PyUnicode_WSTR(self) = (wchar_t *)data;
13811 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013812
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013813 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013814 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013815 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013816#ifdef Py_DEBUG
13817 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13818#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013819 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013820 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013821
13822onError:
13823 Py_DECREF(unicode);
13824 Py_DECREF(self);
13825 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013826}
13827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013828PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013829 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013830\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013831Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013832encoding defaults to the current default string encoding.\n\
13833errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013834
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013835static PyObject *unicode_iter(PyObject *seq);
13836
Guido van Rossumd57fd912000-03-10 22:53:23 +000013837PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013838 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013839 "str", /* tp_name */
13840 sizeof(PyUnicodeObject), /* tp_size */
13841 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013842 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013843 (destructor)unicode_dealloc, /* tp_dealloc */
13844 0, /* tp_print */
13845 0, /* tp_getattr */
13846 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013847 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013848 unicode_repr, /* tp_repr */
13849 &unicode_as_number, /* tp_as_number */
13850 &unicode_as_sequence, /* tp_as_sequence */
13851 &unicode_as_mapping, /* tp_as_mapping */
13852 (hashfunc) unicode_hash, /* tp_hash*/
13853 0, /* tp_call*/
13854 (reprfunc) unicode_str, /* tp_str */
13855 PyObject_GenericGetAttr, /* tp_getattro */
13856 0, /* tp_setattro */
13857 0, /* tp_as_buffer */
13858 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013859 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013860 unicode_doc, /* tp_doc */
13861 0, /* tp_traverse */
13862 0, /* tp_clear */
13863 PyUnicode_RichCompare, /* tp_richcompare */
13864 0, /* tp_weaklistoffset */
13865 unicode_iter, /* tp_iter */
13866 0, /* tp_iternext */
13867 unicode_methods, /* tp_methods */
13868 0, /* tp_members */
13869 0, /* tp_getset */
13870 &PyBaseObject_Type, /* tp_base */
13871 0, /* tp_dict */
13872 0, /* tp_descr_get */
13873 0, /* tp_descr_set */
13874 0, /* tp_dictoffset */
13875 0, /* tp_init */
13876 0, /* tp_alloc */
13877 unicode_new, /* tp_new */
13878 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013879};
13880
13881/* Initialize the Unicode implementation */
13882
Victor Stinner3a50e702011-10-18 21:21:00 +020013883int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013884{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013885 int i;
13886
Thomas Wouters477c8d52006-05-27 19:21:47 +000013887 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013888 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013889 0x000A, /* LINE FEED */
13890 0x000D, /* CARRIAGE RETURN */
13891 0x001C, /* FILE SEPARATOR */
13892 0x001D, /* GROUP SEPARATOR */
13893 0x001E, /* RECORD SEPARATOR */
13894 0x0085, /* NEXT LINE */
13895 0x2028, /* LINE SEPARATOR */
13896 0x2029, /* PARAGRAPH SEPARATOR */
13897 };
13898
Fred Drakee4315f52000-05-09 19:53:39 +000013899 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013900 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013901 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013902 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010013903 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013904
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013905 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013906 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013907 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013908 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013909
13910 /* initialize the linebreak bloom filter */
13911 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013912 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013913 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013914
13915 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013916
13917#ifdef HAVE_MBCS
13918 winver.dwOSVersionInfoSize = sizeof(winver);
13919 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13920 PyErr_SetFromWindowsErr(0);
13921 return -1;
13922 }
13923#endif
13924 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013925}
13926
13927/* Finalize the Unicode implementation */
13928
/* Does no work in this implementation; the result is always 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13934
Guido van Rossumd57fd912000-03-10 22:53:23 +000013935void
Thomas Wouters78890102000-07-22 19:25:51 +000013936_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013937{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013938 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013939
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013940 Py_XDECREF(unicode_empty);
13941 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013942
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013943 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013944 if (unicode_latin1[i]) {
13945 Py_DECREF(unicode_latin1[i]);
13946 unicode_latin1[i] = NULL;
13947 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013948 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013949 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013950 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013951}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013952
Walter Dörwald16807132007-05-25 13:52:07 +000013953void
13954PyUnicode_InternInPlace(PyObject **p)
13955{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013956 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013957 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013958#ifdef Py_DEBUG
13959 assert(s != NULL);
13960 assert(_PyUnicode_CHECK(s));
13961#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013962 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013963 return;
13964#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013965 /* If it's a subclass, we don't really know what putting
13966 it in the interned dict might do. */
13967 if (!PyUnicode_CheckExact(s))
13968 return;
13969 if (PyUnicode_CHECK_INTERNED(s))
13970 return;
13971 if (interned == NULL) {
13972 interned = PyDict_New();
13973 if (interned == NULL) {
13974 PyErr_Clear(); /* Don't leave an exception */
13975 return;
13976 }
13977 }
13978 /* It might be that the GetItem call fails even
13979 though the key is present in the dictionary,
13980 namely when this happens during a stack overflow. */
13981 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013982 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013983 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013984
Benjamin Peterson29060642009-01-31 22:14:21 +000013985 if (t) {
13986 Py_INCREF(t);
13987 Py_DECREF(*p);
13988 *p = t;
13989 return;
13990 }
Walter Dörwald16807132007-05-25 13:52:07 +000013991
Benjamin Peterson14339b62009-01-31 16:36:08 +000013992 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013993 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013994 PyErr_Clear();
13995 PyThreadState_GET()->recursion_critical = 0;
13996 return;
13997 }
13998 PyThreadState_GET()->recursion_critical = 0;
13999 /* The two references in interned are not counted by refcnt.
14000 The deallocator will take care of this */
14001 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014002 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014003}
14004
14005void
14006PyUnicode_InternImmortal(PyObject **p)
14007{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014008 PyUnicode_InternInPlace(p);
14009 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014010 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014011 Py_INCREF(*p);
14012 }
Walter Dörwald16807132007-05-25 13:52:07 +000014013}
14014
14015PyObject *
14016PyUnicode_InternFromString(const char *cp)
14017{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014018 PyObject *s = PyUnicode_FromString(cp);
14019 if (s == NULL)
14020 return NULL;
14021 PyUnicode_InternInPlace(&s);
14022 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014023}
14024
Alexander Belopolsky40018472011-02-26 01:02:56 +000014025void
14026_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014027{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014028 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014029 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014030 Py_ssize_t i, n;
14031 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014032
Benjamin Peterson14339b62009-01-31 16:36:08 +000014033 if (interned == NULL || !PyDict_Check(interned))
14034 return;
14035 keys = PyDict_Keys(interned);
14036 if (keys == NULL || !PyList_Check(keys)) {
14037 PyErr_Clear();
14038 return;
14039 }
Walter Dörwald16807132007-05-25 13:52:07 +000014040
Benjamin Peterson14339b62009-01-31 16:36:08 +000014041 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14042 detector, interned unicode strings are not forcibly deallocated;
14043 rather, we give them their stolen references back, and then clear
14044 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014045
Benjamin Peterson14339b62009-01-31 16:36:08 +000014046 n = PyList_GET_SIZE(keys);
14047 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014048 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014049 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014050 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014051 if (PyUnicode_READY(s) == -1) {
14052 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014053 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014055 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014056 case SSTATE_NOT_INTERNED:
14057 /* XXX Shouldn't happen */
14058 break;
14059 case SSTATE_INTERNED_IMMORTAL:
14060 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014061 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014062 break;
14063 case SSTATE_INTERNED_MORTAL:
14064 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014065 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014066 break;
14067 default:
14068 Py_FatalError("Inconsistent interned string state.");
14069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014070 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014071 }
14072 fprintf(stderr, "total size of all interned strings: "
14073 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14074 "mortal/immortal\n", mortal_size, immortal_size);
14075 Py_DECREF(keys);
14076 PyDict_Clear(interned);
14077 Py_DECREF(interned);
14078 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014079}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014080
14081
14082/********************* Unicode Iterator **************************/
14083
14084typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014085 PyObject_HEAD
14086 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014087 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014088} unicodeiterobject;
14089
14090static void
14091unicodeiter_dealloc(unicodeiterobject *it)
14092{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014093 _PyObject_GC_UNTRACK(it);
14094 Py_XDECREF(it->it_seq);
14095 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014096}
14097
14098static int
14099unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14100{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014101 Py_VISIT(it->it_seq);
14102 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014103}
14104
14105static PyObject *
14106unicodeiter_next(unicodeiterobject *it)
14107{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014108 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014109
Benjamin Peterson14339b62009-01-31 16:36:08 +000014110 assert(it != NULL);
14111 seq = it->it_seq;
14112 if (seq == NULL)
14113 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014114 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014116 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14117 int kind = PyUnicode_KIND(seq);
14118 void *data = PyUnicode_DATA(seq);
14119 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14120 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014121 if (item != NULL)
14122 ++it->it_index;
14123 return item;
14124 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014125
Benjamin Peterson14339b62009-01-31 16:36:08 +000014126 Py_DECREF(seq);
14127 it->it_seq = NULL;
14128 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014129}
14130
14131static PyObject *
14132unicodeiter_len(unicodeiterobject *it)
14133{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014134 Py_ssize_t len = 0;
14135 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014136 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014137 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014138}
14139
14140PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14141
14142static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014143 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014144 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014145 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014146};
14147
14148PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014149 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14150 "str_iterator", /* tp_name */
14151 sizeof(unicodeiterobject), /* tp_basicsize */
14152 0, /* tp_itemsize */
14153 /* methods */
14154 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14155 0, /* tp_print */
14156 0, /* tp_getattr */
14157 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014158 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014159 0, /* tp_repr */
14160 0, /* tp_as_number */
14161 0, /* tp_as_sequence */
14162 0, /* tp_as_mapping */
14163 0, /* tp_hash */
14164 0, /* tp_call */
14165 0, /* tp_str */
14166 PyObject_GenericGetAttr, /* tp_getattro */
14167 0, /* tp_setattro */
14168 0, /* tp_as_buffer */
14169 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14170 0, /* tp_doc */
14171 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14172 0, /* tp_clear */
14173 0, /* tp_richcompare */
14174 0, /* tp_weaklistoffset */
14175 PyObject_SelfIter, /* tp_iter */
14176 (iternextfunc)unicodeiter_next, /* tp_iternext */
14177 unicodeiter_methods, /* tp_methods */
14178 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014179};
14180
14181static PyObject *
14182unicode_iter(PyObject *seq)
14183{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014184 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014185
Benjamin Peterson14339b62009-01-31 16:36:08 +000014186 if (!PyUnicode_Check(seq)) {
14187 PyErr_BadInternalCall();
14188 return NULL;
14189 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014190 if (PyUnicode_READY(seq) == -1)
14191 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014192 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14193 if (it == NULL)
14194 return NULL;
14195 it->it_index = 0;
14196 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014197 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014198 _PyObject_GC_TRACK(it);
14199 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014200}
14201
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014202
/* Return the number of Py_UNICODE characters in u before the terminating
   null character.

   The counter has the same type as the return value (size_t); an int
   counter would overflow on strings longer than INT_MAX characters. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t res = 0;
    while (*u++)
        res++;
    return res;
}
14211
/* Copy the null-terminated string s2, terminator included, into s1 and
   return s1.  s1 must be large enough to hold the result. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        const Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (ch == 0)
            break;
    }
    return s1;
}
14219
/* Copy at most n characters of the null-terminated string s2 into s1 and
   return s1.

   Copying stops after the terminating null character has been copied or
   after n characters, whichever comes first.  Never more than n characters
   are written, so nothing is written when n is 0.  If s2 holds n or more
   characters before its terminator, the result is not null-terminated.
   Unlike C strncpy(), the remainder of s1 is not padded with nulls. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    /* Check the budget before each store so the n-th character is the
       last one that can be written. */
    for (; n != 0; n--) {
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
14229
/* Append the null-terminated string s2, terminator included, to the end
   of the null-terminated string s1 and return s1.  s1 must have room for
   the combined string. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *tail = s1;

    /* locate the terminator of s1 */
    while (*tail != 0)
        tail++;
    /* copy s2 over it, terminator included */
    while ((*tail++ = *s2++) != 0)
        ;
    return s1;
}
14238
/* Compare two null-terminated Py_UNICODE strings.

   Returns -1, 0 or +1.  At the first position where the strings differ,
   the string with the smaller character sorts first; a string that is a
   proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    for (;;) {
        const Py_UNICODE a = *s1++;
        const Py_UNICODE b = *s2++;

        if (a == 0)
            return (b == 0) ? 0 : -1;   /* s1 ended: equal or prefix */
        if (b == 0)
            return 1;                   /* s2 is a proper prefix of s1 */
        if (a != b)
            return (a < b) ? -1 : +1;
    }
}
14252
/* Compare at most n characters of two null-terminated Py_UNICODE strings.

   Returns -1 or +1 at the first differing character, and 0 if the strings
   agree over the first n characters or up to a common terminator. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    while (n-- > 0) {
        const Py_UNICODE a = *s1++;
        const Py_UNICODE b = *s2++;

        if (a != b)
            return (a < b) ? -1 : +1;
        if (a == 0)
            break;      /* both strings ended together */
    }
    return 0;
}
14269
/* Return a pointer to the first occurrence of c in the null-terminated
   string s, or NULL if there is none.  The terminator itself is never
   matched, so searching for 0 returns NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
14279
/* Return a pointer to the last occurrence of c in the null-terminated
   string s, or NULL if there is none.  The terminator itself is never
   matched, so searching for 0 returns NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *last = NULL;

    /* single forward pass remembering the most recent match */
    for (; *s != 0; s++) {
        if (*s == c)
            last = s;
    }
    return (Py_UNICODE*)last;
}
Victor Stinner331ea922010-08-10 16:37:20 +000014292
Victor Stinner71133ff2010-09-01 23:43:53 +000014293Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014294PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014295{
Victor Stinner577db2c2011-10-11 22:12:48 +020014296 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014297 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014298
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014299 if (!PyUnicode_Check(unicode)) {
14300 PyErr_BadArgument();
14301 return NULL;
14302 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014303 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014304 if (u == NULL)
14305 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014306 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014307 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014308 PyErr_NoMemory();
14309 return NULL;
14310 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014311 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014312 size *= sizeof(Py_UNICODE);
14313 copy = PyMem_Malloc(size);
14314 if (copy == NULL) {
14315 PyErr_NoMemory();
14316 return NULL;
14317 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014318 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014319 return copy;
14320}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014321
Georg Brandl66c221e2010-10-14 07:04:07 +000014322/* A _string module, to export formatter_parser and formatter_field_name_split
14323 to the string.Formatter class implemented in Python. */
14324
14325static PyMethodDef _string_methods[] = {
14326 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14327 METH_O, PyDoc_STR("split the argument as a field name")},
14328 {"formatter_parser", (PyCFunction) formatter_parser,
14329 METH_O, PyDoc_STR("parse the argument as a format string")},
14330 {NULL, NULL}
14331};
14332
14333static struct PyModuleDef _string_module = {
14334 PyModuleDef_HEAD_INIT,
14335 "_string",
14336 PyDoc_STR("string helper module"),
14337 0,
14338 _string_methods,
14339 NULL,
14340 NULL,
14341 NULL,
14342 NULL
14343};
14344
14345PyMODINIT_FUNC
14346PyInit__string(void)
14347{
14348 return PyModule_Create(&_string_module);
14349}
14350
14351
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014352#ifdef __cplusplus
14353}
14354#endif