blob: ce6a19610389e958b2cde9c0c7677275208a78eb [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Byte-order switches.  When WORDS_BIGENDIAN is defined the big-endian
   switch is set; in every other case little endian is assumed. */

#if defined(WORDS_BIGENDIAN)
#  define BYTEORDER_IS_BIG_ENDIAN
#else
#  define BYTEORDER_IS_LITTLE_ENDIAN
#endif
56
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000057/* --- Globals ------------------------------------------------------------
58
59 The globals are initialized by the _PyUnicode_Init() API and should
60 not be used before calling that API.
61
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Highest code point defined by Unicode 6.0: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10FFFF
71
/* Object validation used by the assertions in this file.  Debug builds
   run _PyUnicode_CheckConsistency() with check_content=0; all other
   builds only perform the PyUnicode_Check() type test. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* --- Field accessors ----------------------------------------------------

   The underscore-prefixed macros cast the object and access the struct
   member directly; they are usable as lvalues unless noted.  The
   unprefixed macros first assert that the object passes _PyUnicode_CHECK()
   and is ready, and special-case the compact ASCII layout.
*/

/* utf8 member of the compact structure (no checks). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer: for a compact ASCII object this is the memory that
   directly follows the PyASCIIObject header, otherwise the utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* utf8_length member of the compact structure (no checks). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length: for a compact ASCII object this is the length member of
   the PyASCIIObject header, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wstr / wstr_length members (no checks). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* length, state and hash members of the PyASCIIObject header (no checks). */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* state.kind and length, read after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* data.any member of the full PyUnicodeObject structure (no checks). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* File-local replacement for PyUnicode_READY(): asserts that op passes
   _PyUnicode_CHECK(), then evaluates to 0 when the object is already
   ready and to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the utf8 pointer of op refers to the same memory as its
   character data.  Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op refers to the same memory as its
   character data.  The comparison uses the macro argument itself; it does
   not depend on a variable called "unicode" being in scope at the call
   site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* True if the Unicode object owns a separately allocated UTF-8 block:
   it is not compact ASCII, its utf8 pointer is set, and that pointer is
   not shared with the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
135
/* True if the Unicode object owns a separately allocated wstr block:
   the wstr pointer is set and either the object is not ready yet or the
   pointer is not shared with the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
143
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each argument is evaluated exactly once.  "to" is parenthesized before
   the cast so that an argument such as "base + offset" is converted as a
   whole; without the parentheses the cast would apply to "base" only and
   the offset would be scaled by sizeof(to_type).  All helper locals carry
   a leading underscore to stay clear of names used at the call site.

   The main loop copies four characters per iteration; the trailing loop
   handles the remaining 0-3 characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200167
/* Mark the Unicode string as modified by resetting its cached hash
   to -1. */
#define _PyUnicode_DIRTY(op)                            \
    do {                                                \
        _PyUnicode_HASH(op) = -1;                       \
    } while (0)
170
Walter Dörwald16807132007-05-25 13:52:07 +0000171/* This dictionary holds all interned unicode strings. Note that references
172 to strings in this dictionary are *not* counted in the string's ob_refcnt.
173 When the interned string reaches a refcnt of 0 the string deallocation
174 function will delete the reference from this dictionary.
175
176 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000177 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000178*/
179static PyObject *interned;
180
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200182static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000183
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200184/* List of static strings. */
185static _Py_Identifier *static_strings;
186
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187/* Single character Unicode strings in the Latin-1 range are being
188 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200189static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
221
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200223static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200224static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200225static void copy_characters(
226 PyObject *to, Py_ssize_t to_start,
227 PyObject *from, Py_ssize_t from_start,
228 Py_ssize_t how_many);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200231unicode_fromascii(const unsigned char *s, Py_ssize_t size);
232static PyObject *
233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
234static PyObject *
235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
236static PyObject *
237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
238
239static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100242 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100248 PyObject *unicode,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Same kind of lookup table for line breaks: 128 entries indexed by ASCII
   code, 1 for a line-break character and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build invariant checker for a Unicode object.

   Asserts that the state flags (kind, compact, ascii, ready, interned)
   and the data / utf8 / wstr pointers and lengths of op agree with each
   other for the layout the object claims to use.  If check_content is
   non-zero and the object is not in the wchar_t-only state, the characters
   are scanned as well to assert that the narrowest possible kind is in
   use.

   Every violation trips an assert(); when all checks pass the function
   returns 1, which lets callers wrap the call itself in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject header is inspected */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int native_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int native_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready: only the wstr buffer may be set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact: separate character buffer */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == native_kind) {
                /* same width as wchar_t: wstr must alias the data */
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* a missing buffer must have a zero length recorded */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 cur = PyUnicode_READ(kind, data, pos);
            if (cur > maxchar) {
                maxchar = cur;
            }
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
    }
    return 1;
}
#endif
405
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100406static PyObject*
407unicode_result_wchar(PyObject *unicode)
408{
409#ifndef Py_DEBUG
410 Py_ssize_t len;
411
412 assert(Py_REFCNT(unicode) == 1);
413
414 len = _PyUnicode_WSTR_LENGTH(unicode);
415 if (len == 0) {
416 Py_INCREF(unicode_empty);
417 Py_DECREF(unicode);
418 return unicode_empty;
419 }
420
421 if (len == 1) {
422 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
423 if (ch < 256) {
424 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
425 Py_DECREF(unicode);
426 return latin1_char;
427 }
428 }
429
430 if (_PyUnicode_Ready(unicode) < 0) {
431 Py_XDECREF(unicode);
432 return NULL;
433 }
434#else
435 /* don't make the result ready in debug mode to ensure that the caller
436 makes the string ready before using it */
437 assert(_PyUnicode_CheckConsistency(unicode, 1));
438#endif
439 return unicode;
440}
441
442static PyObject*
443unicode_result_ready(PyObject *unicode)
444{
445 Py_ssize_t length;
446
447 length = PyUnicode_GET_LENGTH(unicode);
448 if (length == 0) {
449 if (unicode != unicode_empty) {
450 Py_INCREF(unicode_empty);
451 Py_DECREF(unicode);
452 }
453 return unicode_empty;
454 }
455
456 if (length == 1) {
457 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
458 if (ch < 256) {
459 PyObject *latin1_char = unicode_latin1[ch];
460 if (latin1_char != NULL) {
461 if (unicode != latin1_char) {
462 Py_INCREF(latin1_char);
463 Py_DECREF(unicode);
464 }
465 return latin1_char;
466 }
467 else {
468 assert(_PyUnicode_CheckConsistency(unicode, 1));
469 Py_INCREF(unicode);
470 unicode_latin1[ch] = unicode;
471 return unicode;
472 }
473 }
474 }
475
476 assert(_PyUnicode_CheckConsistency(unicode, 1));
477 return unicode;
478}
479
480static PyObject*
481unicode_result(PyObject *unicode)
482{
483 assert(_PyUnicode_CHECK(unicode));
484 if (PyUnicode_IS_READY(unicode))
485 return unicode_result_ready(unicode);
486 else
487 return unicode_result_wchar(unicode);
488}
489
Victor Stinnerc4b49542011-12-11 22:44:26 +0100490static PyObject*
491unicode_result_unchanged(PyObject *unicode)
492{
493 if (PyUnicode_CheckExact(unicode)) {
494 if (PyUnicode_READY(unicode) < 0)
495 return NULL;
496 Py_INCREF(unicode);
497 return unicode;
498 }
499 else
500 /* Subtype -- return genuine unicode string with the same value. */
501 return PyUnicode_Copy(unicode);
502}
503
#if defined(HAVE_MBCS)
/* Windows version information; only declared when HAVE_MBCS is defined. */
static OSVERSIONINFOEX winver;
#endif
507
Thomas Wouters477c8d52006-05-27 19:21:47 +0000508/* --- Bloom Filters ----------------------------------------------------- */
509
510/* stuff to implement simple "bloom filters" for Unicode characters.
511 to keep things simple, we use a single bitmask, using the least 5
512 bits from each unicode characters as the bit index. */
513
514/* the linebreak mask is set up by Unicode_Init below */
515
Antoine Pitrouf068f942010-01-13 14:19:12 +0000516#if LONG_BIT >= 128
517#define BLOOM_WIDTH 128
518#elif LONG_BIT >= 64
519#define BLOOM_WIDTH 64
520#elif LONG_BIT >= 32
521#define BLOOM_WIDTH 32
522#else
523#error "LONG_BIT is smaller than 32"
524#endif
525
/* Integer type that holds a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK() for non-ASCII characters. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit selected by ch in the lvalue mask.
   BLOOM(mask, ch):     non-zero if the bit selected by ch is set in mask.

   The bit index is ch masked with (BLOOM_WIDTH - 1).  Both macros
   parenthesize the mask argument so that a compound expression such as
   "a | b" is tested as a whole instead of being split by the higher
   precedence of "&". */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000532
/* Line-break test: characters below 128 are looked up directly in
   ascii_linebreak[]; all others must first pass the bloom_linebreak
   filter and are then confirmed with Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000536
Alexander Belopolsky40018472011-02-26 01:02:56 +0000537Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200538make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539{
540 /* calculate simple bloom-style bitmask for a given unicode string */
541
Antoine Pitrouf068f942010-01-13 14:19:12 +0000542 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543 Py_ssize_t i;
544
545 mask = 0;
546 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548
549 return mask;
550}
551
/* True if chr occurs in str: the bloom mask is consulted first and only
   on a hit is the string actually searched with PyUnicode_FindChar(). */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000555
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200556/* Compilation of templated routines */
557
558#include "stringlib/asciilib.h"
559#include "stringlib/fastsearch.h"
560#include "stringlib/partition.h"
561#include "stringlib/split.h"
562#include "stringlib/count.h"
563#include "stringlib/find.h"
564#include "stringlib/find_max_char.h"
565#include "stringlib/localeutil.h"
566#include "stringlib/undef.h"
567
568#include "stringlib/ucs1lib.h"
569#include "stringlib/fastsearch.h"
570#include "stringlib/partition.h"
571#include "stringlib/split.h"
572#include "stringlib/count.h"
573#include "stringlib/find.h"
574#include "stringlib/find_max_char.h"
575#include "stringlib/localeutil.h"
576#include "stringlib/undef.h"
577
578#include "stringlib/ucs2lib.h"
579#include "stringlib/fastsearch.h"
580#include "stringlib/partition.h"
581#include "stringlib/split.h"
582#include "stringlib/count.h"
583#include "stringlib/find.h"
584#include "stringlib/find_max_char.h"
585#include "stringlib/localeutil.h"
586#include "stringlib/undef.h"
587
588#include "stringlib/ucs4lib.h"
589#include "stringlib/fastsearch.h"
590#include "stringlib/partition.h"
591#include "stringlib/split.h"
592#include "stringlib/count.h"
593#include "stringlib/find.h"
594#include "stringlib/find_max_char.h"
595#include "stringlib/localeutil.h"
596#include "stringlib/undef.h"
597
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200598#include "stringlib/unicodedefs.h"
599#include "stringlib/fastsearch.h"
600#include "stringlib/count.h"
601#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100602#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200603
Guido van Rossumd57fd912000-03-10 22:53:23 +0000604/* --- Unicode Object ----------------------------------------------------- */
605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200606static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200607fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200608
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200609Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
610 Py_ssize_t size, Py_UCS4 ch,
611 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200612{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200613 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
614
615 switch (kind) {
616 case PyUnicode_1BYTE_KIND:
617 {
618 Py_UCS1 ch1 = (Py_UCS1) ch;
619 if (ch1 == ch)
620 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
621 else
622 return -1;
623 }
624 case PyUnicode_2BYTE_KIND:
625 {
626 Py_UCS2 ch2 = (Py_UCS2) ch;
627 if (ch2 == ch)
628 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
629 else
630 return -1;
631 }
632 case PyUnicode_4BYTE_KIND:
633 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
634 default:
635 assert(0);
636 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200638}
639
Victor Stinnerfe226c02011-10-03 03:52:20 +0200640static PyObject*
641resize_compact(PyObject *unicode, Py_ssize_t length)
642{
643 Py_ssize_t char_size;
644 Py_ssize_t struct_size;
645 Py_ssize_t new_size;
646 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100647 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200648
649 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200650 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200651 if (PyUnicode_IS_COMPACT_ASCII(unicode))
652 struct_size = sizeof(PyASCIIObject);
653 else
654 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200655 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200656
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
Victor Stinner84def372011-12-11 20:04:56 +0100658 Py_DECREF(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200659 PyErr_NoMemory();
660 return NULL;
661 }
662 new_size = (struct_size + (length + 1) * char_size);
663
Victor Stinner84def372011-12-11 20:04:56 +0100664 _Py_DEC_REFTOTAL;
665 _Py_ForgetReference(unicode);
666
667 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
668 if (new_unicode == NULL) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200669 PyObject_Del(unicode);
670 PyErr_NoMemory();
671 return NULL;
672 }
Victor Stinner84def372011-12-11 20:04:56 +0100673 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100675
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200677 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200679 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
680 _PyUnicode_WSTR_LENGTH(unicode) = length;
681 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
683 length, 0);
684 return unicode;
685}
686
Alexander Belopolsky40018472011-02-26 01:02:56 +0000687static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200688resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689{
Victor Stinner95663112011-10-04 01:03:50 +0200690 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200691 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200692 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000693
Victor Stinner95663112011-10-04 01:03:50 +0200694 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695
696 if (PyUnicode_IS_READY(unicode)) {
697 Py_ssize_t char_size;
698 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200699 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200700 void *data;
701
702 data = _PyUnicode_DATA_ANY(unicode);
703 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200704 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200705 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
706 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200707 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
708 {
709 PyObject_DEL(_PyUnicode_UTF8(unicode));
710 _PyUnicode_UTF8(unicode) = NULL;
711 _PyUnicode_UTF8_LENGTH(unicode) = 0;
712 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200713
714 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
715 PyErr_NoMemory();
716 return -1;
717 }
718 new_size = (length + 1) * char_size;
719
720 data = (PyObject *)PyObject_REALLOC(data, new_size);
721 if (data == NULL) {
722 PyErr_NoMemory();
723 return -1;
724 }
725 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200726 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200727 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200728 _PyUnicode_WSTR_LENGTH(unicode) = length;
729 }
730 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200731 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200732 _PyUnicode_UTF8_LENGTH(unicode) = length;
733 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200734 _PyUnicode_LENGTH(unicode) = length;
735 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200736 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200737 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200738 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200739 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200740 }
Victor Stinner95663112011-10-04 01:03:50 +0200741 assert(_PyUnicode_WSTR(unicode) != NULL);
742
743 /* check for integer overflow */
744 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
745 PyErr_NoMemory();
746 return -1;
747 }
748 wstr = _PyUnicode_WSTR(unicode);
749 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
750 if (!wstr) {
751 PyErr_NoMemory();
752 return -1;
753 }
754 _PyUnicode_WSTR(unicode) = wstr;
755 _PyUnicode_WSTR(unicode)[length] = 0;
756 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200757 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000758 return 0;
759}
760
Victor Stinnerfe226c02011-10-03 03:52:20 +0200761static PyObject*
762resize_copy(PyObject *unicode, Py_ssize_t length)
763{
764 Py_ssize_t copy_length;
765 if (PyUnicode_IS_COMPACT(unicode)) {
766 PyObject *copy;
767 assert(PyUnicode_IS_READY(unicode));
768
769 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
770 if (copy == NULL)
771 return NULL;
772
773 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200774 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200775 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200776 }
777 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200778 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200779 assert(_PyUnicode_WSTR(unicode) != NULL);
780 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200781 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200782 if (w == NULL)
783 return NULL;
784 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
785 copy_length = Py_MIN(copy_length, length);
786 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
787 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200788 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789 }
790}
791
/* We allocate room for one more character than requested to make sure the
   string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200801#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200802static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200803#endif
804
Alexander Belopolsky40018472011-02-26 01:02:56 +0000805static PyUnicodeObject *
806_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000807{
808 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000810
Thomas Wouters477c8d52006-05-27 19:21:47 +0000811 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000812 if (length == 0 && unicode_empty != NULL) {
813 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200814 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815 }
816
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000817 /* Ensure we won't overflow the size. */
818 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
819 return (PyUnicodeObject *)PyErr_NoMemory();
820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821 if (length < 0) {
822 PyErr_SetString(PyExc_SystemError,
823 "Negative size passed to _PyUnicode_New");
824 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000825 }
826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200827#ifdef Py_DEBUG
828 ++unicode_old_new_calls;
829#endif
830
831 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
832 if (unicode == NULL)
833 return NULL;
834 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
835 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
836 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000837 PyErr_NoMemory();
838 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200840
Jeremy Hyltond8082792003-09-16 19:41:39 +0000841 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000842 * the caller fails before initializing str -- unicode_resize()
843 * reads str[0], and the Keep-Alive optimization can keep memory
844 * allocated for str alive across a call to unicode_dealloc(unicode).
845 * We don't want unicode_resize to read uninitialized memory in
846 * that case.
847 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200848 _PyUnicode_WSTR(unicode)[0] = 0;
849 _PyUnicode_WSTR(unicode)[length] = 0;
850 _PyUnicode_WSTR_LENGTH(unicode) = length;
851 _PyUnicode_HASH(unicode) = -1;
852 _PyUnicode_STATE(unicode).interned = 0;
853 _PyUnicode_STATE(unicode).kind = 0;
854 _PyUnicode_STATE(unicode).compact = 0;
855 _PyUnicode_STATE(unicode).ready = 0;
856 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200857 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200859 _PyUnicode_UTF8(unicode) = NULL;
860 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100861 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000862 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000863
Benjamin Peterson29060642009-01-31 22:14:21 +0000864 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000865 /* XXX UNREF/NEWREF interface should be more symmetrical */
866 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000867 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000868 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000869 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000870}
871
Victor Stinnerf42dc442011-10-02 23:33:16 +0200872static const char*
873unicode_kind_name(PyObject *unicode)
874{
Victor Stinner42dfd712011-10-03 14:41:45 +0200875 /* don't check consistency: unicode_kind_name() is called from
876 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200877 if (!PyUnicode_IS_COMPACT(unicode))
878 {
879 if (!PyUnicode_IS_READY(unicode))
880 return "wstr";
881 switch(PyUnicode_KIND(unicode))
882 {
883 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200884 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200885 return "legacy ascii";
886 else
887 return "legacy latin1";
888 case PyUnicode_2BYTE_KIND:
889 return "legacy UCS2";
890 case PyUnicode_4BYTE_KIND:
891 return "legacy UCS4";
892 default:
893 return "<legacy invalid kind>";
894 }
895 }
896 assert(PyUnicode_IS_READY(unicode));
897 switch(PyUnicode_KIND(unicode))
898 {
899 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 return "ascii";
902 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200903 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200904 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200905 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200906 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200907 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200908 default:
909 return "<invalid compact kind>";
910 }
911}
912
#ifdef Py_DEBUG
/* Debug-build counter; declared here for use elsewhere in this file. */
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Return PyUnicode_UTF8(unicode). */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Return _PyUnicode_COMPACT_DATA(unicode). */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the candidate data addresses of `unicode` to stdout and return
   PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", (void*)_PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line description of the string object `op` to stdout:
   representation name, length, the wstr pointer, and -- unless the object
   is compact ASCII -- the wstr length and the utf8 pointer/length, followed
   by the data pointer.  "shared" is printed in front of a pointer that is
   equal to the data pointer. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* Locate the character data without going through the accessor
       macros. */
    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;

    /* The length fields are Py_ssize_t (signed), hence %zd rather than
       %zu; %p requires a void* argument, hence the casts below. */
    printf("%s: len=%zd, ", unicode_kind_name(op), ascii->length);

    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", (void *)ascii->wstr);

    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
966
967PyObject *
968PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
969{
970 PyObject *obj;
971 PyCompactUnicodeObject *unicode;
972 void *data;
973 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200974 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 Py_ssize_t char_size;
976 Py_ssize_t struct_size;
977
978 /* Optimization for empty strings */
979 if (size == 0 && unicode_empty != NULL) {
980 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200981 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200982 }
983
984#ifdef Py_DEBUG
985 ++unicode_new_new_calls;
986#endif
987
Victor Stinner9e9d6892011-10-04 01:02:02 +0200988 is_ascii = 0;
989 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200990 struct_size = sizeof(PyCompactUnicodeObject);
991 if (maxchar < 128) {
992 kind_state = PyUnicode_1BYTE_KIND;
993 char_size = 1;
994 is_ascii = 1;
995 struct_size = sizeof(PyASCIIObject);
996 }
997 else if (maxchar < 256) {
998 kind_state = PyUnicode_1BYTE_KIND;
999 char_size = 1;
1000 }
1001 else if (maxchar < 65536) {
1002 kind_state = PyUnicode_2BYTE_KIND;
1003 char_size = 2;
1004 if (sizeof(wchar_t) == 2)
1005 is_sharing = 1;
1006 }
1007 else {
1008 kind_state = PyUnicode_4BYTE_KIND;
1009 char_size = 4;
1010 if (sizeof(wchar_t) == 4)
1011 is_sharing = 1;
1012 }
1013
1014 /* Ensure we won't overflow the size. */
1015 if (size < 0) {
1016 PyErr_SetString(PyExc_SystemError,
1017 "Negative size passed to PyUnicode_New");
1018 return NULL;
1019 }
1020 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1021 return PyErr_NoMemory();
1022
1023 /* Duplicated allocation code from _PyObject_New() instead of a call to
1024 * PyObject_New() so we are able to allocate space for the object and
1025 * it's data buffer.
1026 */
1027 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1028 if (obj == NULL)
1029 return PyErr_NoMemory();
1030 obj = PyObject_INIT(obj, &PyUnicode_Type);
1031 if (obj == NULL)
1032 return NULL;
1033
1034 unicode = (PyCompactUnicodeObject *)obj;
1035 if (is_ascii)
1036 data = ((PyASCIIObject*)obj) + 1;
1037 else
1038 data = unicode + 1;
1039 _PyUnicode_LENGTH(unicode) = size;
1040 _PyUnicode_HASH(unicode) = -1;
1041 _PyUnicode_STATE(unicode).interned = 0;
1042 _PyUnicode_STATE(unicode).kind = kind_state;
1043 _PyUnicode_STATE(unicode).compact = 1;
1044 _PyUnicode_STATE(unicode).ready = 1;
1045 _PyUnicode_STATE(unicode).ascii = is_ascii;
1046 if (is_ascii) {
1047 ((char*)data)[size] = 0;
1048 _PyUnicode_WSTR(unicode) = NULL;
1049 }
1050 else if (kind_state == PyUnicode_1BYTE_KIND) {
1051 ((char*)data)[size] = 0;
1052 _PyUnicode_WSTR(unicode) = NULL;
1053 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001055 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 }
1057 else {
1058 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001059 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 if (kind_state == PyUnicode_2BYTE_KIND)
1061 ((Py_UCS2*)data)[size] = 0;
1062 else /* kind_state == PyUnicode_4BYTE_KIND */
1063 ((Py_UCS4*)data)[size] = 0;
1064 if (is_sharing) {
1065 _PyUnicode_WSTR_LENGTH(unicode) = size;
1066 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1067 }
1068 else {
1069 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1070 _PyUnicode_WSTR(unicode) = NULL;
1071 }
1072 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001073 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074 return obj;
1075}
1076
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing into the
   4-byte data buffer of `unicode`.  A high surrogate immediately followed
   by a low surrogate is joined into one code point; every other unit,
   including a lone surrogate, is copied as is.  The other conversions are
   implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* never write past the destination's declared length */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            dst++;
            src += 2;
        }
        else {
            *dst = *src;
            dst++;
            src++;
        }
    }
    /* the destination must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1116
Victor Stinnercd9950f2011-10-02 00:34:53 +02001117static int
1118_PyUnicode_Dirty(PyObject *unicode)
1119{
Victor Stinner910337b2011-10-03 03:20:16 +02001120 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001121 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001122 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001123 "Cannot modify a string having more than 1 reference");
1124 return -1;
1125 }
1126 _PyUnicode_DIRTY(unicode);
1127 return 0;
1128}
1129
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130static int
1131_copy_characters(PyObject *to, Py_ssize_t to_start,
1132 PyObject *from, Py_ssize_t from_start,
1133 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001135 unsigned int from_kind, to_kind;
1136 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001137 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001139 assert(PyUnicode_Check(from));
1140 assert(PyUnicode_Check(to));
1141 assert(PyUnicode_IS_READY(from));
1142 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001143
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001144 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1145 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1146 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001148 if (how_many == 0)
1149 return 0;
1150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001152 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001153 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001154 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001155
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001156#ifdef Py_DEBUG
1157 if (!check_maxchar
1158 && (from_kind > to_kind
1159 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001160 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001161 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1162 Py_UCS4 ch;
1163 Py_ssize_t i;
1164 for (i=0; i < how_many; i++) {
1165 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1166 assert(ch <= to_maxchar);
1167 }
1168 }
1169#endif
1170 fast = (from_kind == to_kind);
1171 if (check_maxchar
1172 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1173 {
1174 /* deny latin1 => ascii */
1175 fast = 0;
1176 }
1177
1178 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001179 Py_MEMCPY((char*)to_data + to_kind * to_start,
1180 (char*)from_data + from_kind * from_start,
1181 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001182 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001183 else if (from_kind == PyUnicode_1BYTE_KIND
1184 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001185 {
1186 _PyUnicode_CONVERT_BYTES(
1187 Py_UCS1, Py_UCS2,
1188 PyUnicode_1BYTE_DATA(from) + from_start,
1189 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1190 PyUnicode_2BYTE_DATA(to) + to_start
1191 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001192 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001193 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001194 && to_kind == PyUnicode_4BYTE_KIND)
1195 {
1196 _PyUnicode_CONVERT_BYTES(
1197 Py_UCS1, Py_UCS4,
1198 PyUnicode_1BYTE_DATA(from) + from_start,
1199 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1200 PyUnicode_4BYTE_DATA(to) + to_start
1201 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001202 }
1203 else if (from_kind == PyUnicode_2BYTE_KIND
1204 && to_kind == PyUnicode_4BYTE_KIND)
1205 {
1206 _PyUnicode_CONVERT_BYTES(
1207 Py_UCS2, Py_UCS4,
1208 PyUnicode_2BYTE_DATA(from) + from_start,
1209 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1210 PyUnicode_4BYTE_DATA(to) + to_start
1211 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001212 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001214 /* check if max_char(from substring) <= max_char(to) */
1215 if (from_kind > to_kind
1216 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001217 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001218 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001219 /* slow path to check for character overflow */
1220 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001221 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001222 Py_ssize_t i;
1223
Victor Stinner56c161a2011-10-06 02:47:11 +02001224#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001225 for (i=0; i < how_many; i++) {
1226 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001227 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001228 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1229 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001230#else
1231 if (!check_maxchar) {
1232 for (i=0; i < how_many; i++) {
1233 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1234 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1235 }
1236 }
1237 else {
1238 for (i=0; i < how_many; i++) {
1239 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1240 if (ch > to_maxchar)
1241 return 1;
1242 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1243 }
1244 }
1245#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001246 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001247 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001248 assert(0 && "inconsistent state");
1249 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001250 }
1251 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001252 return 0;
1253}
1254
1255static void
1256copy_characters(PyObject *to, Py_ssize_t to_start,
1257 PyObject *from, Py_ssize_t from_start,
1258 Py_ssize_t how_many)
1259{
1260 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1261}
1262
1263Py_ssize_t
1264PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1265 PyObject *from, Py_ssize_t from_start,
1266 Py_ssize_t how_many)
1267{
1268 int err;
1269
1270 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1271 PyErr_BadInternalCall();
1272 return -1;
1273 }
1274
1275 if (PyUnicode_READY(from))
1276 return -1;
1277 if (PyUnicode_READY(to))
1278 return -1;
1279
1280 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1281 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1282 PyErr_Format(PyExc_SystemError,
1283 "Cannot write %zi characters at %zi "
1284 "in a string of %zi characters",
1285 how_many, to_start, PyUnicode_GET_LENGTH(to));
1286 return -1;
1287 }
1288
1289 if (how_many == 0)
1290 return 0;
1291
1292 if (_PyUnicode_Dirty(to))
1293 return -1;
1294
1295 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1296 if (err) {
1297 PyErr_Format(PyExc_SystemError,
1298 "Cannot copy %s characters "
1299 "into a string of %s characters",
1300 unicode_kind_name(from),
1301 unicode_kind_name(to));
1302 return -1;
1303 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001304 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305}
1306
Victor Stinner17222162011-09-28 22:15:37 +02001307/* Find the maximum code point and count the number of surrogate pairs so a
1308 correct string length can be computed before converting a string to UCS4.
1309 This function counts single surrogates as a character and not as a pair.
1310
1311 Return 0 on success, or -1 on error. */
1312static int
1313find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1314 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315{
1316 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001317 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001318
Victor Stinnerc53be962011-10-02 21:33:54 +02001319 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 *num_surrogates = 0;
1321 *maxchar = 0;
1322
1323 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001325 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1326 && (iter+1) < end
1327 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001329 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001331 iter += 2;
1332 }
1333 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001335 {
1336 ch = *iter;
1337 iter++;
1338 }
1339 if (ch > *maxchar) {
1340 *maxchar = ch;
1341 if (*maxchar > MAX_UNICODE) {
1342 PyErr_Format(PyExc_ValueError,
1343 "character U+%x is not in range [U+0000; U+10ffff]",
1344 ch);
1345 return -1;
1346 }
1347 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001348 }
1349 return 0;
1350}
1351
1352#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001353static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001354#endif
1355
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001356int
1357_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001358{
1359 wchar_t *end;
1360 Py_UCS4 maxchar = 0;
1361 Py_ssize_t num_surrogates;
1362#if SIZEOF_WCHAR_T == 2
1363 Py_ssize_t length_wo_surrogates;
1364#endif
1365
Georg Brandl7597add2011-10-05 16:36:47 +02001366 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001367 strings were created using _PyObject_New() and where no canonical
1368 representation (the str field) has been set yet aka strings
1369 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001370 assert(_PyUnicode_CHECK(unicode));
1371 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001373 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001374 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001375 /* Actually, it should neither be interned nor be anything else: */
1376 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377
1378#ifdef Py_DEBUG
1379 ++unicode_ready_calls;
1380#endif
1381
1382 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001383 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001384 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386
1387 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001388 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1389 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390 PyErr_NoMemory();
1391 return -1;
1392 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001393 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 _PyUnicode_WSTR(unicode), end,
1395 PyUnicode_1BYTE_DATA(unicode));
1396 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1397 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1398 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1399 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001400 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001401 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001402 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403 }
1404 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001405 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001406 _PyUnicode_UTF8(unicode) = NULL;
1407 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 }
1409 PyObject_FREE(_PyUnicode_WSTR(unicode));
1410 _PyUnicode_WSTR(unicode) = NULL;
1411 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1412 }
1413 /* In this case we might have to convert down from 4-byte native
1414 wchar_t to 2-byte unicode. */
1415 else if (maxchar < 65536) {
1416 assert(num_surrogates == 0 &&
1417 "FindMaxCharAndNumSurrogatePairs() messed up");
1418
Victor Stinner506f5922011-09-28 22:34:18 +02001419#if SIZEOF_WCHAR_T == 2
1420 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001421 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001422 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1423 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1424 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001425 _PyUnicode_UTF8(unicode) = NULL;
1426 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001427#else
1428 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001429 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001430 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001431 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001432 PyErr_NoMemory();
1433 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001434 }
Victor Stinner506f5922011-09-28 22:34:18 +02001435 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1436 _PyUnicode_WSTR(unicode), end,
1437 PyUnicode_2BYTE_DATA(unicode));
1438 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1439 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1440 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001441 _PyUnicode_UTF8(unicode) = NULL;
1442 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001443 PyObject_FREE(_PyUnicode_WSTR(unicode));
1444 _PyUnicode_WSTR(unicode) = NULL;
1445 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1446#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 }
1448 /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1449 else {
1450#if SIZEOF_WCHAR_T == 2
1451 /* in case the native representation is 2-bytes, we need to allocate a
1452 new normalized 4-byte version. */
1453 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001454 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1455 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 PyErr_NoMemory();
1457 return -1;
1458 }
1459 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1460 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001461 _PyUnicode_UTF8(unicode) = NULL;
1462 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001463 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1464 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001465 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466 PyObject_FREE(_PyUnicode_WSTR(unicode));
1467 _PyUnicode_WSTR(unicode) = NULL;
1468 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1469#else
1470 assert(num_surrogates == 0);
1471
Victor Stinnerc3c74152011-10-02 20:39:55 +02001472 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001474 _PyUnicode_UTF8(unicode) = NULL;
1475 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1477#endif
1478 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1479 }
1480 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001481 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482 return 0;
1483}
1484
Alexander Belopolsky40018472011-02-26 01:02:56 +00001485static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001486unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001487{
Walter Dörwald16807132007-05-25 13:52:07 +00001488 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001489 case SSTATE_NOT_INTERNED:
1490 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001491
Benjamin Peterson29060642009-01-31 22:14:21 +00001492 case SSTATE_INTERNED_MORTAL:
1493 /* revive dead object temporarily for DelItem */
1494 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001495 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001496 Py_FatalError(
1497 "deletion of interned string failed");
1498 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001499
Benjamin Peterson29060642009-01-31 22:14:21 +00001500 case SSTATE_INTERNED_IMMORTAL:
1501 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001502
Benjamin Peterson29060642009-01-31 22:14:21 +00001503 default:
1504 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001505 }
1506
Victor Stinner03490912011-10-03 23:45:12 +02001507 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001508 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001509 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001510 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511
1512 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001513 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001514 }
1515 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001516 if (_PyUnicode_DATA_ANY(unicode))
1517 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001518 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001519 }
1520}
1521
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared objects kept by
   this file (the empty string, or a one-character string stored in the
   unicode_latin1 cache), and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 only_char;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string of length 1 that is not of the wchar kind is looked up
       in the Latin-1 cache. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    only_char = PyUnicode_READ_CHAR(unicode, 0);
    return only_char < 256 && unicode_latin1[only_char] == unicode;
}
#endif
1538
Alexander Belopolsky40018472011-02-26 01:02:56 +00001539static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001540unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001541{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001542 if (Py_REFCNT(unicode) != 1)
1543 return 0;
1544 if (PyUnicode_CHECK_INTERNED(unicode))
1545 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001546#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001547 /* singleton refcount is greater than 1 */
1548 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001549#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001550 return 1;
1551}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001552
Victor Stinnerfe226c02011-10-03 03:52:20 +02001553static int
1554unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1555{
1556 PyObject *unicode;
1557 Py_ssize_t old_length;
1558
1559 assert(p_unicode != NULL);
1560 unicode = *p_unicode;
1561
1562 assert(unicode != NULL);
1563 assert(PyUnicode_Check(unicode));
1564 assert(0 <= length);
1565
Victor Stinner910337b2011-10-03 03:20:16 +02001566 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001567 old_length = PyUnicode_WSTR_LENGTH(unicode);
1568 else
1569 old_length = PyUnicode_GET_LENGTH(unicode);
1570 if (old_length == length)
1571 return 0;
1572
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001573 if (length == 0) {
1574 Py_DECREF(*p_unicode);
1575 *p_unicode = unicode_empty;
1576 Py_INCREF(*p_unicode);
1577 return 0;
1578 }
1579
Victor Stinnerfe226c02011-10-03 03:52:20 +02001580 if (!unicode_resizable(unicode)) {
1581 PyObject *copy = resize_copy(unicode, length);
1582 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001583 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001584 Py_DECREF(*p_unicode);
1585 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001586 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001587 }
1588
Victor Stinnerfe226c02011-10-03 03:52:20 +02001589 if (PyUnicode_IS_COMPACT(unicode)) {
1590 *p_unicode = resize_compact(unicode, length);
1591 if (*p_unicode == NULL)
1592 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001593 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001594 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001595 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001596 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001597}
1598
Alexander Belopolsky40018472011-02-26 01:02:56 +00001599int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001600PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001601{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001602 PyObject *unicode;
1603 if (p_unicode == NULL) {
1604 PyErr_BadInternalCall();
1605 return -1;
1606 }
1607 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001608 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001609 {
1610 PyErr_BadInternalCall();
1611 return -1;
1612 }
1613 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001614}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001615
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001616static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001617unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001618{
1619 PyObject *result;
1620 assert(PyUnicode_IS_READY(*p_unicode));
1621 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1622 return 0;
1623 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1624 maxchar);
1625 if (result == NULL)
1626 return -1;
1627 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1628 PyUnicode_GET_LENGTH(*p_unicode));
1629 Py_DECREF(*p_unicode);
1630 *p_unicode = result;
1631 return 0;
1632}
1633
1634static int
1635unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1636 Py_UCS4 ch)
1637{
1638 if (unicode_widen(p_unicode, ch) < 0)
1639 return -1;
1640 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1641 PyUnicode_DATA(*p_unicode),
1642 (*pos)++, ch);
1643 return 0;
1644}
1645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646static PyObject*
1647get_latin1_char(unsigned char ch)
1648{
Victor Stinnera464fc12011-10-02 20:39:30 +02001649 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001650 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001651 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652 if (!unicode)
1653 return NULL;
1654 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001655 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656 unicode_latin1[ch] = unicode;
1657 }
1658 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001659 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001660}
1661
Alexander Belopolsky40018472011-02-26 01:02:56 +00001662PyObject *
1663PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001664{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001665 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666 Py_UCS4 maxchar = 0;
1667 Py_ssize_t num_surrogates;
1668
1669 if (u == NULL)
1670 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001671
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001672 /* If the Unicode data is known at construction time, we can apply
1673 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001675 /* Optimization for empty strings */
1676 if (size == 0 && unicode_empty != NULL) {
1677 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001678 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001679 }
Tim Petersced69f82003-09-16 20:30:58 +00001680
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 /* Single character Unicode objects in the Latin-1 range are
1682 shared when using this constructor */
1683 if (size == 1 && *u < 256)
1684 return get_latin1_char((unsigned char)*u);
1685
1686 /* If not empty and not single character, copy the Unicode data
1687 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001688 if (find_maxchar_surrogates(u, u + size,
1689 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001690 return NULL;
1691
Victor Stinner8faf8212011-12-08 22:14:11 +01001692 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693 if (!unicode)
1694 return NULL;
1695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001696 switch (PyUnicode_KIND(unicode)) {
1697 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001698 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1700 break;
1701 case PyUnicode_2BYTE_KIND:
1702#if Py_UNICODE_SIZE == 2
1703 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1704#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001705 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1707#endif
1708 break;
1709 case PyUnicode_4BYTE_KIND:
1710#if SIZEOF_WCHAR_T == 2
1711 /* This is the only case which has to process surrogates, thus
1712 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001713 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001714#else
1715 assert(num_surrogates == 0);
1716 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1717#endif
1718 break;
1719 default:
1720 assert(0 && "Impossible state");
1721 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001723 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724}
1725
Alexander Belopolsky40018472011-02-26 01:02:56 +00001726PyObject *
1727PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001728{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001729 if (size < 0) {
1730 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001731 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001732 return NULL;
1733 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001734 if (u != NULL)
1735 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1736 else
1737 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001738}
1739
Alexander Belopolsky40018472011-02-26 01:02:56 +00001740PyObject *
1741PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001742{
1743 size_t size = strlen(u);
1744 if (size > PY_SSIZE_T_MAX) {
1745 PyErr_SetString(PyExc_OverflowError, "input too long");
1746 return NULL;
1747 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001748 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001749}
1750
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001751PyObject *
1752_PyUnicode_FromId(_Py_Identifier *id)
1753{
1754 if (!id->object) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001755 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1756 strlen(id->string),
1757 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001758 if (!id->object)
1759 return NULL;
1760 PyUnicode_InternInPlace(&id->object);
1761 assert(!id->next);
1762 id->next = static_strings;
1763 static_strings = id;
1764 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001765 return id->object;
1766}
1767
1768void
1769_PyUnicode_ClearStaticStrings()
1770{
1771 _Py_Identifier *i;
1772 for (i = static_strings; i; i = i->next) {
1773 Py_DECREF(i->object);
1774 i->object = NULL;
1775 i->next = NULL;
1776 }
1777}
1778
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001779/* Internal function, don't check maximum character */
1780
Victor Stinnere57b1c02011-09-28 22:20:48 +02001781static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001782unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001783{
Victor Stinner785938e2011-12-11 20:09:03 +01001784 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001785 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001786#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001787 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001788#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001789 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001790 }
Victor Stinner785938e2011-12-11 20:09:03 +01001791 unicode = PyUnicode_New(size, 127);
1792 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001793 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001794 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1795 assert(_PyUnicode_CheckConsistency(unicode, 1));
1796 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001797}
1798
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001799static Py_UCS4
1800kind_maxchar_limit(unsigned int kind)
1801{
1802 switch(kind) {
1803 case PyUnicode_1BYTE_KIND:
1804 return 0x80;
1805 case PyUnicode_2BYTE_KIND:
1806 return 0x100;
1807 case PyUnicode_4BYTE_KIND:
1808 return 0x10000;
1809 default:
1810 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001811 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001812 }
1813}
1814
Victor Stinner702c7342011-10-05 13:50:52 +02001815static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001816_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001817{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001819 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001820
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001821 if (size == 0) {
1822 Py_INCREF(unicode_empty);
1823 return unicode_empty;
1824 }
1825 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001826 if (size == 1)
1827 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001828
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001829 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001830 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 if (!res)
1832 return NULL;
1833 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001834 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001836}
1837
Victor Stinnere57b1c02011-09-28 22:20:48 +02001838static PyObject*
1839_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840{
1841 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001842 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001843
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001844 if (size == 0) {
1845 Py_INCREF(unicode_empty);
1846 return unicode_empty;
1847 }
1848 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001849 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001850 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001851
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001852 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001853 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001854 if (!res)
1855 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001856 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001857 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001858 else {
1859 _PyUnicode_CONVERT_BYTES(
1860 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1861 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001862 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001863 return res;
1864}
1865
Victor Stinnere57b1c02011-09-28 22:20:48 +02001866static PyObject*
1867_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001868{
1869 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001870 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001871
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001872 if (size == 0) {
1873 Py_INCREF(unicode_empty);
1874 return unicode_empty;
1875 }
1876 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001877 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001878 return get_latin1_char((unsigned char)u[0]);
1879
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001880 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001881 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001882 if (!res)
1883 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001884 if (max_char < 256)
1885 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1886 PyUnicode_1BYTE_DATA(res));
1887 else if (max_char < 0x10000)
1888 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1889 PyUnicode_2BYTE_DATA(res));
1890 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001891 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001892 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001893 return res;
1894}
1895
1896PyObject*
1897PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1898{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001899 if (size < 0) {
1900 PyErr_SetString(PyExc_ValueError, "size must be positive");
1901 return NULL;
1902 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 switch(kind) {
1904 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001905 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001906 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001907 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001908 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001909 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001910 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001911 PyErr_SetString(PyExc_SystemError, "invalid kind");
1912 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001914}
1915
Victor Stinner25a4b292011-10-06 12:31:55 +02001916/* Ensure that a string uses the most efficient storage, if it is not the
1917 case: create a new string with of the right kind. Write NULL into *p_unicode
1918 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001919static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001920unicode_adjust_maxchar(PyObject **p_unicode)
1921{
1922 PyObject *unicode, *copy;
1923 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001924 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001925 unsigned int kind;
1926
1927 assert(p_unicode != NULL);
1928 unicode = *p_unicode;
1929 assert(PyUnicode_IS_READY(unicode));
1930 if (PyUnicode_IS_ASCII(unicode))
1931 return;
1932
1933 len = PyUnicode_GET_LENGTH(unicode);
1934 kind = PyUnicode_KIND(unicode);
1935 if (kind == PyUnicode_1BYTE_KIND) {
1936 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001937 max_char = ucs1lib_find_max_char(u, u + len);
1938 if (max_char >= 128)
1939 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001940 }
1941 else if (kind == PyUnicode_2BYTE_KIND) {
1942 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001943 max_char = ucs2lib_find_max_char(u, u + len);
1944 if (max_char >= 256)
1945 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001946 }
1947 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001948 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001949 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001950 max_char = ucs4lib_find_max_char(u, u + len);
1951 if (max_char >= 0x10000)
1952 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001953 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001954 copy = PyUnicode_New(len, max_char);
1955 copy_characters(copy, 0, unicode, 0, len);
1956 Py_DECREF(unicode);
1957 *p_unicode = copy;
1958}
1959
Victor Stinner034f6cf2011-09-30 02:26:44 +02001960PyObject*
1961PyUnicode_Copy(PyObject *unicode)
1962{
Victor Stinner87af4f22011-11-21 23:03:47 +01001963 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001964 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001965
Victor Stinner034f6cf2011-09-30 02:26:44 +02001966 if (!PyUnicode_Check(unicode)) {
1967 PyErr_BadInternalCall();
1968 return NULL;
1969 }
1970 if (PyUnicode_READY(unicode))
1971 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001972
Victor Stinner87af4f22011-11-21 23:03:47 +01001973 length = PyUnicode_GET_LENGTH(unicode);
1974 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001975 if (!copy)
1976 return NULL;
1977 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1978
Victor Stinner87af4f22011-11-21 23:03:47 +01001979 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1980 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001981 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001982 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001983}
1984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001985
Victor Stinnerbc603d12011-10-02 01:00:40 +02001986/* Widen Unicode objects to larger buffers. Don't write terminating null
1987 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988
1989void*
1990_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1991{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001992 Py_ssize_t len;
1993 void *result;
1994 unsigned int skind;
1995
1996 if (PyUnicode_READY(s))
1997 return NULL;
1998
1999 len = PyUnicode_GET_LENGTH(s);
2000 skind = PyUnicode_KIND(s);
2001 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002002 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002003 return NULL;
2004 }
2005 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002006 case PyUnicode_2BYTE_KIND:
2007 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2008 if (!result)
2009 return PyErr_NoMemory();
2010 assert(skind == PyUnicode_1BYTE_KIND);
2011 _PyUnicode_CONVERT_BYTES(
2012 Py_UCS1, Py_UCS2,
2013 PyUnicode_1BYTE_DATA(s),
2014 PyUnicode_1BYTE_DATA(s) + len,
2015 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002017 case PyUnicode_4BYTE_KIND:
2018 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2019 if (!result)
2020 return PyErr_NoMemory();
2021 if (skind == PyUnicode_2BYTE_KIND) {
2022 _PyUnicode_CONVERT_BYTES(
2023 Py_UCS2, Py_UCS4,
2024 PyUnicode_2BYTE_DATA(s),
2025 PyUnicode_2BYTE_DATA(s) + len,
2026 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002027 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002028 else {
2029 assert(skind == PyUnicode_1BYTE_KIND);
2030 _PyUnicode_CONVERT_BYTES(
2031 Py_UCS1, Py_UCS4,
2032 PyUnicode_1BYTE_DATA(s),
2033 PyUnicode_1BYTE_DATA(s) + len,
2034 result);
2035 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002037 default:
2038 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002039 }
Victor Stinner01698042011-10-04 00:04:26 +02002040 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 return NULL;
2042}
2043
2044static Py_UCS4*
2045as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2046 int copy_null)
2047{
2048 int kind;
2049 void *data;
2050 Py_ssize_t len, targetlen;
2051 if (PyUnicode_READY(string) == -1)
2052 return NULL;
2053 kind = PyUnicode_KIND(string);
2054 data = PyUnicode_DATA(string);
2055 len = PyUnicode_GET_LENGTH(string);
2056 targetlen = len;
2057 if (copy_null)
2058 targetlen++;
2059 if (!target) {
2060 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2061 PyErr_NoMemory();
2062 return NULL;
2063 }
2064 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2065 if (!target) {
2066 PyErr_NoMemory();
2067 return NULL;
2068 }
2069 }
2070 else {
2071 if (targetsize < targetlen) {
2072 PyErr_Format(PyExc_SystemError,
2073 "string is longer than the buffer");
2074 if (copy_null && 0 < targetsize)
2075 target[0] = 0;
2076 return NULL;
2077 }
2078 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002079 if (kind == PyUnicode_1BYTE_KIND) {
2080 Py_UCS1 *start = (Py_UCS1 *) data;
2081 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002082 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002083 else if (kind == PyUnicode_2BYTE_KIND) {
2084 Py_UCS2 *start = (Py_UCS2 *) data;
2085 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2086 }
2087 else {
2088 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002089 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002090 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091 if (copy_null)
2092 target[len] = 0;
2093 return target;
2094}
2095
2096Py_UCS4*
2097PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2098 int copy_null)
2099{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002100 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002101 PyErr_BadInternalCall();
2102 return NULL;
2103 }
2104 return as_ucs4(string, target, targetsize, copy_null);
2105}
2106
2107Py_UCS4*
2108PyUnicode_AsUCS4Copy(PyObject *string)
2109{
2110 return as_ucs4(string, NULL, 0, 1);
2111}
2112
#ifdef HAVE_WCHAR_H

/* Create a string object from `size` wide characters at `w`.

   A size of -1 means the length is taken with wcslen().  A NULL `w` is
   accepted only together with size 0 (a new reference to unicode_empty is
   returned); otherwise it is reported with PyErr_BadInternalCall() and NULL
   is returned.  The conversion itself is done by PyUnicode_FromUnicode(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        Py_ssize_t length = size;

        if (length == -1)
            length = wcslen(w);
        return PyUnicode_FromUnicode(w, length);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    Py_INCREF(unicode_empty);
    return unicode_empty;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002135
Walter Dörwald346737f2007-05-31 10:44:43 +00002136static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002137makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2138 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002139{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002140 *fmt++ = '%';
2141 if (width) {
2142 if (zeropad)
2143 *fmt++ = '0';
2144 fmt += sprintf(fmt, "%d", width);
2145 }
2146 if (precision)
2147 fmt += sprintf(fmt, ".%d", precision);
2148 if (longflag)
2149 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002150 else if (longlongflag) {
2151 /* longlongflag should only ever be nonzero on machines with
2152 HAVE_LONG_LONG defined */
2153#ifdef HAVE_LONG_LONG
2154 char *f = PY_FORMAT_LONG_LONG;
2155 while (*f)
2156 *fmt++ = *f++;
2157#else
2158 /* we shouldn't ever get here */
2159 assert(0);
2160 *fmt++ = 'l';
2161#endif
2162 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002163 else if (size_tflag) {
2164 char *f = PY_FORMAT_SIZE_T;
2165 while (*f)
2166 *fmt++ = *f++;
2167 }
2168 *fmt++ = c;
2169 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002170}
2171
/* Helpers for PyUnicode_FromFormatV() */

/* Return non-zero if ch is one of the integer conversions ('d', 'u', 'i')
   that may be preceded by an 'l', 'll' or 'z' size modifier. */
static int
format_takes_size_modifier(char ch)
{
    return ch == 'd' || ch == 'u' || ch == 'i';
}

/* Parse the "width.precision" part and the size modifier of one conversion
   specification.

   f must point to the '%' that starts the specification.  The parsed values
   are stored through the p_* pointers; any of them may be NULL when the
   caller is not interested in that value.  A missing width or precision is
   reported as 0.

   Returns a pointer to the conversion character (e.g. the 'd' of "%5ld").
   For malformed input the pointer is moved back one character so that the
   caller's switch falls into its "unknown format" branch:
     - "%.3%s": the result points to "3";
     - "%.1" (format ends early): the result points to "1". */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* step over the '%' */
    f++;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %li, %lld, %llu, %lli and the size_t variants
       %zd, %zu, %zi.  The modifier is consumed only when it is directly
       followed by one of those conversions. */
    switch (*f) {
    case 'l':
        if (format_takes_size_modifier(f[1])) {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' && format_takes_size_modifier(f[2])) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (format_takes_size_modifier(f[1])) {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2236
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002237/* maximum number of characters required for output of %ld. 21 characters
2238 allows for 64-bit integers (in decimal) and an optional sign. */
2239#define MAX_LONG_CHARS 21
2240/* maximum number of characters required for output of %lld.
2241 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2242 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2243#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2244
Walter Dörwaldd2034312007-05-18 16:29:38 +00002245PyObject *
2246PyUnicode_FromFormatV(const char *format, va_list vargs)
2247{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002248 va_list count;
2249 Py_ssize_t callcount = 0;
2250 PyObject **callresults = NULL;
2251 PyObject **callresult = NULL;
2252 Py_ssize_t n = 0;
2253 int width = 0;
2254 int precision = 0;
2255 int zeropad;
2256 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002257 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002258 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002259 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002260 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2261 Py_UCS4 argmaxchar;
2262 Py_ssize_t numbersize = 0;
2263 char *numberresults = NULL;
2264 char *numberresult = NULL;
2265 Py_ssize_t i;
2266 int kind;
2267 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002268
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002269 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002270 /* step 1: count the number of %S/%R/%A/%s format specifications
2271 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2272 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002274 * also estimate a upper bound for all the number formats in the string,
2275 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002277 for (f = format; *f; f++) {
2278 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002279 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002280 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2281 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2282 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2283 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002285 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002286#ifdef HAVE_LONG_LONG
2287 if (longlongflag) {
2288 if (width < MAX_LONG_LONG_CHARS)
2289 width = MAX_LONG_LONG_CHARS;
2290 }
2291 else
2292#endif
2293 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2294 including sign. Decimal takes the most space. This
2295 isn't enough for octal. If a width is specified we
2296 need more (which we allocate later). */
2297 if (width < MAX_LONG_CHARS)
2298 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002299
2300 /* account for the size + '\0' to separate numbers
2301 inside of the numberresults buffer */
2302 numbersize += (width + 1);
2303 }
2304 }
2305 else if ((unsigned char)*f > 127) {
2306 PyErr_Format(PyExc_ValueError,
2307 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2308 "string, got a non-ASCII byte: 0x%02x",
2309 (unsigned char)*f);
2310 return NULL;
2311 }
2312 }
2313 /* step 2: allocate memory for the results of
2314 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2315 if (callcount) {
2316 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2317 if (!callresults) {
2318 PyErr_NoMemory();
2319 return NULL;
2320 }
2321 callresult = callresults;
2322 }
2323 /* step 2.5: allocate memory for the results of formating numbers */
2324 if (numbersize) {
2325 numberresults = PyObject_Malloc(numbersize);
2326 if (!numberresults) {
2327 PyErr_NoMemory();
2328 goto fail;
2329 }
2330 numberresult = numberresults;
2331 }
2332
2333 /* step 3: format numbers and figure out how large a buffer we need */
2334 for (f = format; *f; f++) {
2335 if (*f == '%') {
2336 const char* p;
2337 int longflag;
2338 int longlongflag;
2339 int size_tflag;
2340 int numprinted;
2341
2342 p = f;
2343 zeropad = (f[1] == '0');
2344 f = parse_format_flags(f, &width, &precision,
2345 &longflag, &longlongflag, &size_tflag);
2346 switch (*f) {
2347 case 'c':
2348 {
2349 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002350 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002351 n++;
2352 break;
2353 }
2354 case '%':
2355 n++;
2356 break;
2357 case 'i':
2358 case 'd':
2359 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2360 width, precision, *f);
2361 if (longflag)
2362 numprinted = sprintf(numberresult, fmt,
2363 va_arg(count, long));
2364#ifdef HAVE_LONG_LONG
2365 else if (longlongflag)
2366 numprinted = sprintf(numberresult, fmt,
2367 va_arg(count, PY_LONG_LONG));
2368#endif
2369 else if (size_tflag)
2370 numprinted = sprintf(numberresult, fmt,
2371 va_arg(count, Py_ssize_t));
2372 else
2373 numprinted = sprintf(numberresult, fmt,
2374 va_arg(count, int));
2375 n += numprinted;
2376 /* advance by +1 to skip over the '\0' */
2377 numberresult += (numprinted + 1);
2378 assert(*(numberresult - 1) == '\0');
2379 assert(*(numberresult - 2) != '\0');
2380 assert(numprinted >= 0);
2381 assert(numberresult <= numberresults + numbersize);
2382 break;
2383 case 'u':
2384 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2385 width, precision, 'u');
2386 if (longflag)
2387 numprinted = sprintf(numberresult, fmt,
2388 va_arg(count, unsigned long));
2389#ifdef HAVE_LONG_LONG
2390 else if (longlongflag)
2391 numprinted = sprintf(numberresult, fmt,
2392 va_arg(count, unsigned PY_LONG_LONG));
2393#endif
2394 else if (size_tflag)
2395 numprinted = sprintf(numberresult, fmt,
2396 va_arg(count, size_t));
2397 else
2398 numprinted = sprintf(numberresult, fmt,
2399 va_arg(count, unsigned int));
2400 n += numprinted;
2401 numberresult += (numprinted + 1);
2402 assert(*(numberresult - 1) == '\0');
2403 assert(*(numberresult - 2) != '\0');
2404 assert(numprinted >= 0);
2405 assert(numberresult <= numberresults + numbersize);
2406 break;
2407 case 'x':
2408 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2409 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2410 n += numprinted;
2411 numberresult += (numprinted + 1);
2412 assert(*(numberresult - 1) == '\0');
2413 assert(*(numberresult - 2) != '\0');
2414 assert(numprinted >= 0);
2415 assert(numberresult <= numberresults + numbersize);
2416 break;
2417 case 'p':
2418 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2419 /* %p is ill-defined: ensure leading 0x. */
2420 if (numberresult[1] == 'X')
2421 numberresult[1] = 'x';
2422 else if (numberresult[1] != 'x') {
2423 memmove(numberresult + 2, numberresult,
2424 strlen(numberresult) + 1);
2425 numberresult[0] = '0';
2426 numberresult[1] = 'x';
2427 numprinted += 2;
2428 }
2429 n += numprinted;
2430 numberresult += (numprinted + 1);
2431 assert(*(numberresult - 1) == '\0');
2432 assert(*(numberresult - 2) != '\0');
2433 assert(numprinted >= 0);
2434 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002435 break;
2436 case 's':
2437 {
2438 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002439 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002440 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002441 if (!str)
2442 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002443 /* since PyUnicode_DecodeUTF8 returns already flexible
2444 unicode objects, there is no need to call ready on them */
2445 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002446 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002448 /* Remember the str and switch to the next slot */
2449 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002450 break;
2451 }
2452 case 'U':
2453 {
2454 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002455 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456 if (PyUnicode_READY(obj) == -1)
2457 goto fail;
2458 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002459 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002460 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002461 break;
2462 }
2463 case 'V':
2464 {
2465 PyObject *obj = va_arg(count, PyObject *);
2466 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002467 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002468 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002469 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002470 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002471 if (PyUnicode_READY(obj) == -1)
2472 goto fail;
2473 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002474 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002475 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002476 *callresult++ = NULL;
2477 }
2478 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002479 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002480 if (!str_obj)
2481 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002482 if (PyUnicode_READY(str_obj)) {
2483 Py_DECREF(str_obj);
2484 goto fail;
2485 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002486 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002487 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002488 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002489 *callresult++ = str_obj;
2490 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002491 break;
2492 }
2493 case 'S':
2494 {
2495 PyObject *obj = va_arg(count, PyObject *);
2496 PyObject *str;
2497 assert(obj);
2498 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002500 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002501 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002502 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002504 /* Remember the str and switch to the next slot */
2505 *callresult++ = str;
2506 break;
2507 }
2508 case 'R':
2509 {
2510 PyObject *obj = va_arg(count, PyObject *);
2511 PyObject *repr;
2512 assert(obj);
2513 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002514 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002515 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002516 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002517 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002518 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002519 /* Remember the repr and switch to the next slot */
2520 *callresult++ = repr;
2521 break;
2522 }
2523 case 'A':
2524 {
2525 PyObject *obj = va_arg(count, PyObject *);
2526 PyObject *ascii;
2527 assert(obj);
2528 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002529 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002530 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002531 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002532 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002533 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002534 /* Remember the repr and switch to the next slot */
2535 *callresult++ = ascii;
2536 break;
2537 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002538 default:
2539 /* if we stumble upon an unknown
2540 formatting code, copy the rest of
2541 the format string to the output
2542 string. (we cannot just skip the
2543 code, since there's no way to know
2544 what's in the argument list) */
2545 n += strlen(p);
2546 goto expand;
2547 }
2548 } else
2549 n++;
2550 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002551 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002552 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002553 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002554 we don't have to resize the string.
2555 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002556 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002557 if (!string)
2558 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002559 kind = PyUnicode_KIND(string);
2560 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002561 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002562 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002564 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002565 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002566 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002567
2568 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002569 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2570 /* checking for == because the last argument could be a empty
2571 string, which causes i to point to end, the assert at the end of
2572 the loop */
2573 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002574
Benjamin Peterson14339b62009-01-31 16:36:08 +00002575 switch (*f) {
2576 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002577 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002578 const int ordinal = va_arg(vargs, int);
2579 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002580 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002581 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002582 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002583 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002584 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002585 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002586 case 'p':
2587 /* unused, since we already have the result */
2588 if (*f == 'p')
2589 (void) va_arg(vargs, void *);
2590 else
2591 (void) va_arg(vargs, int);
2592 /* extract the result from numberresults and append. */
2593 for (; *numberresult; ++i, ++numberresult)
2594 PyUnicode_WRITE(kind, data, i, *numberresult);
2595 /* skip over the separating '\0' */
2596 assert(*numberresult == '\0');
2597 numberresult++;
2598 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002599 break;
2600 case 's':
2601 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002602 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002603 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002604 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002605 size = PyUnicode_GET_LENGTH(*callresult);
2606 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002607 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002609 /* We're done with the unicode()/repr() => forget it */
2610 Py_DECREF(*callresult);
2611 /* switch to next unicode()/repr() result */
2612 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002613 break;
2614 }
2615 case 'U':
2616 {
2617 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002618 Py_ssize_t size;
2619 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2620 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002621 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002622 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002623 break;
2624 }
2625 case 'V':
2626 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002627 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002628 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002629 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002630 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002631 size = PyUnicode_GET_LENGTH(obj);
2632 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002633 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002634 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002635 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002636 size = PyUnicode_GET_LENGTH(*callresult);
2637 assert(PyUnicode_KIND(*callresult) <=
2638 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002639 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002640 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002641 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002642 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002643 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002644 break;
2645 }
2646 case 'S':
2647 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002648 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002649 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002650 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002651 /* unused, since we already have the result */
2652 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002653 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002654 copy_characters(string, i, *callresult, 0, size);
2655 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002656 /* We're done with the unicode()/repr() => forget it */
2657 Py_DECREF(*callresult);
2658 /* switch to next unicode()/repr() result */
2659 ++callresult;
2660 break;
2661 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002662 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002663 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002664 break;
2665 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002666 for (; *p; ++p, ++i)
2667 PyUnicode_WRITE(kind, data, i, *p);
2668 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002669 goto end;
2670 }
Victor Stinner1205f272010-09-11 00:54:47 +00002671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002672 else {
2673 assert(i < PyUnicode_GET_LENGTH(string));
2674 PyUnicode_WRITE(kind, data, i++, *f);
2675 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002677 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002678
Benjamin Peterson29060642009-01-31 22:14:21 +00002679 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002680 if (callresults)
2681 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002682 if (numberresults)
2683 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002684 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002685 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002686 if (callresults) {
2687 PyObject **callresult2 = callresults;
2688 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002689 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002690 ++callresult2;
2691 }
2692 PyObject_Free(callresults);
2693 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002694 if (numberresults)
2695 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002696 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002697}
2698
Walter Dörwaldd2034312007-05-18 16:29:38 +00002699PyObject *
2700PyUnicode_FromFormat(const char *format, ...)
2701{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002702 PyObject* ret;
2703 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002704
2705#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002706 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002707#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002708 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002709#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002710 ret = PyUnicode_FromFormatV(format, vargs);
2711 va_end(vargs);
2712 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002713}
2714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002715#ifdef HAVE_WCHAR_H
2716
Victor Stinner5593d8a2010-10-02 11:11:27 +00002717/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2718 convert a Unicode object to a wide character string.
2719
Victor Stinnerd88d9832011-09-06 02:00:05 +02002720 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002721 character) required to convert the unicode object. Ignore size argument.
2722
Victor Stinnerd88d9832011-09-06 02:00:05 +02002723 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002724 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002725 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002726static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002727unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002728 wchar_t *w,
2729 Py_ssize_t size)
2730{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002731 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002732 const wchar_t *wstr;
2733
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002734 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002735 if (wstr == NULL)
2736 return -1;
2737
Victor Stinner5593d8a2010-10-02 11:11:27 +00002738 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002739 if (size > res)
2740 size = res + 1;
2741 else
2742 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002743 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002744 return res;
2745 }
2746 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002747 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002748}
2749
2750Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002751PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002752 wchar_t *w,
2753 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002754{
2755 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002756 PyErr_BadInternalCall();
2757 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002759 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002760}
2761
Victor Stinner137c34c2010-09-29 10:25:54 +00002762wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002763PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002764 Py_ssize_t *size)
2765{
2766 wchar_t* buffer;
2767 Py_ssize_t buflen;
2768
2769 if (unicode == NULL) {
2770 PyErr_BadInternalCall();
2771 return NULL;
2772 }
2773
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002774 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002775 if (buflen == -1)
2776 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002777 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002778 PyErr_NoMemory();
2779 return NULL;
2780 }
2781
Victor Stinner137c34c2010-09-29 10:25:54 +00002782 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2783 if (buffer == NULL) {
2784 PyErr_NoMemory();
2785 return NULL;
2786 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002787 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002788 if (buflen == -1)
2789 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002790 if (size != NULL)
2791 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002792 return buffer;
2793}
2794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002795#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796
Alexander Belopolsky40018472011-02-26 01:02:56 +00002797PyObject *
2798PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002799{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002800 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002801 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002802 PyErr_SetString(PyExc_ValueError,
2803 "chr() arg not in range(0x110000)");
2804 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002805 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002807 if (ordinal < 256)
2808 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002810 v = PyUnicode_New(1, ordinal);
2811 if (v == NULL)
2812 return NULL;
2813 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002814 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002815 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002816}
2817
Alexander Belopolsky40018472011-02-26 01:02:56 +00002818PyObject *
2819PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002820{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002821 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002822 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002823 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002824 if (PyUnicode_READY(obj))
2825 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002826 Py_INCREF(obj);
2827 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002828 }
2829 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002830 /* For a Unicode subtype that's not a Unicode object,
2831 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002832 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002833 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002834 PyErr_Format(PyExc_TypeError,
2835 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002836 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002837 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002838}
2839
Alexander Belopolsky40018472011-02-26 01:02:56 +00002840PyObject *
2841PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002842 const char *encoding,
2843 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002844{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002845 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002846 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002847
Guido van Rossumd57fd912000-03-10 22:53:23 +00002848 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002849 PyErr_BadInternalCall();
2850 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002852
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002853 /* Decoding bytes objects is the most common case and should be fast */
2854 if (PyBytes_Check(obj)) {
2855 if (PyBytes_GET_SIZE(obj) == 0) {
2856 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002857 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002858 }
2859 else {
2860 v = PyUnicode_Decode(
2861 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2862 encoding, errors);
2863 }
2864 return v;
2865 }
2866
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002867 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002868 PyErr_SetString(PyExc_TypeError,
2869 "decoding str is not supported");
2870 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002871 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002872
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002873 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2874 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2875 PyErr_Format(PyExc_TypeError,
2876 "coercing to str: need bytes, bytearray "
2877 "or buffer-like object, %.80s found",
2878 Py_TYPE(obj)->tp_name);
2879 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002880 }
Tim Petersced69f82003-09-16 20:30:58 +00002881
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002882 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002883 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002884 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002885 }
Tim Petersced69f82003-09-16 20:30:58 +00002886 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002887 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002888
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002889 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002890 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891}
2892
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   lower is the output buffer and lower_len its size in bytes, including
   room for the terminating NUL.

   Return 0 on error (the normalized name plus its NUL terminator does not
   fit into lower_len bytes), 1 on success.  On error the content of lower
   is unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* Never copy the default name into a buffer that is too small */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    /* With a zero-sized buffer, lower_len - 1 below would wrap around */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last writable slot, reserved for the terminating NUL */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2929
Alexander Belopolsky40018472011-02-26 01:02:56 +00002930PyObject *
2931PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002932 Py_ssize_t size,
2933 const char *encoding,
2934 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002935{
2936 PyObject *buffer = NULL, *unicode;
2937 Py_buffer info;
2938 char lower[11]; /* Enough for any encoding shortcut */
2939
Fred Drakee4315f52000-05-09 19:53:39 +00002940 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002941 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002942 if ((strcmp(lower, "utf-8") == 0) ||
2943 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002944 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002945 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002946 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002947 (strcmp(lower, "iso-8859-1") == 0))
2948 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002949#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002950 else if (strcmp(lower, "mbcs") == 0)
2951 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002952#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002953 else if (strcmp(lower, "ascii") == 0)
2954 return PyUnicode_DecodeASCII(s, size, errors);
2955 else if (strcmp(lower, "utf-16") == 0)
2956 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2957 else if (strcmp(lower, "utf-32") == 0)
2958 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2959 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002960
2961 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002962 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002963 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002964 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002965 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966 if (buffer == NULL)
2967 goto onError;
2968 unicode = PyCodec_Decode(buffer, encoding, errors);
2969 if (unicode == NULL)
2970 goto onError;
2971 if (!PyUnicode_Check(unicode)) {
2972 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002973 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002974 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002975 Py_DECREF(unicode);
2976 goto onError;
2977 }
2978 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002979 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002980
Benjamin Peterson29060642009-01-31 22:14:21 +00002981 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 Py_XDECREF(buffer);
2983 return NULL;
2984}
2985
Alexander Belopolsky40018472011-02-26 01:02:56 +00002986PyObject *
2987PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002988 const char *encoding,
2989 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002990{
2991 PyObject *v;
2992
2993 if (!PyUnicode_Check(unicode)) {
2994 PyErr_BadArgument();
2995 goto onError;
2996 }
2997
2998 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002999 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003000
3001 /* Decode via the codec registry */
3002 v = PyCodec_Decode(unicode, encoding, errors);
3003 if (v == NULL)
3004 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003005 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003006
Benjamin Peterson29060642009-01-31 22:14:21 +00003007 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003008 return NULL;
3009}
3010
Alexander Belopolsky40018472011-02-26 01:02:56 +00003011PyObject *
3012PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003013 const char *encoding,
3014 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003015{
3016 PyObject *v;
3017
3018 if (!PyUnicode_Check(unicode)) {
3019 PyErr_BadArgument();
3020 goto onError;
3021 }
3022
3023 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003024 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003025
3026 /* Decode via the codec registry */
3027 v = PyCodec_Decode(unicode, encoding, errors);
3028 if (v == NULL)
3029 goto onError;
3030 if (!PyUnicode_Check(v)) {
3031 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003032 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003033 Py_TYPE(v)->tp_name);
3034 Py_DECREF(v);
3035 goto onError;
3036 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003037 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003038
Benjamin Peterson29060642009-01-31 22:14:21 +00003039 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003040 return NULL;
3041}
3042
Alexander Belopolsky40018472011-02-26 01:02:56 +00003043PyObject *
3044PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003045 Py_ssize_t size,
3046 const char *encoding,
3047 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048{
3049 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003050
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051 unicode = PyUnicode_FromUnicode(s, size);
3052 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003053 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3055 Py_DECREF(unicode);
3056 return v;
3057}
3058
Alexander Belopolsky40018472011-02-26 01:02:56 +00003059PyObject *
3060PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003061 const char *encoding,
3062 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003063{
3064 PyObject *v;
3065
3066 if (!PyUnicode_Check(unicode)) {
3067 PyErr_BadArgument();
3068 goto onError;
3069 }
3070
3071 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003072 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003073
3074 /* Encode via the codec registry */
3075 v = PyCodec_Encode(unicode, encoding, errors);
3076 if (v == NULL)
3077 goto onError;
3078 return v;
3079
Benjamin Peterson29060642009-01-31 22:14:21 +00003080 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003081 return NULL;
3082}
3083
Victor Stinnerad158722010-10-27 00:25:46 +00003084PyObject *
3085PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003086{
Victor Stinner99b95382011-07-04 14:23:54 +02003087#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003088 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003089#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003090 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003091#else
Victor Stinner793b5312011-04-27 00:24:21 +02003092 PyInterpreterState *interp = PyThreadState_GET()->interp;
3093 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3094 cannot use it to encode and decode filenames before it is loaded. Load
3095 the Python codec requires to encode at least its own filename. Use the C
3096 version of the locale codec until the codec registry is initialized and
3097 the Python codec is loaded.
3098
3099 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3100 cannot only rely on it: check also interp->fscodec_initialized for
3101 subinterpreters. */
3102 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003103 return PyUnicode_AsEncodedString(unicode,
3104 Py_FileSystemDefaultEncoding,
3105 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003106 }
3107 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003108 /* locale encoding with surrogateescape */
3109 wchar_t *wchar;
3110 char *bytes;
3111 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003112 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003113
3114 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3115 if (wchar == NULL)
3116 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003117 bytes = _Py_wchar2char(wchar, &error_pos);
3118 if (bytes == NULL) {
3119 if (error_pos != (size_t)-1) {
3120 char *errmsg = strerror(errno);
3121 PyObject *exc = NULL;
3122 if (errmsg == NULL)
3123 errmsg = "Py_wchar2char() failed";
3124 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003125 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003126 error_pos, error_pos+1,
3127 errmsg);
3128 Py_XDECREF(exc);
3129 }
3130 else
3131 PyErr_NoMemory();
3132 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003133 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003134 }
3135 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003136
3137 bytes_obj = PyBytes_FromString(bytes);
3138 PyMem_Free(bytes);
3139 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003140 }
Victor Stinnerad158722010-10-27 00:25:46 +00003141#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003142}
3143
Alexander Belopolsky40018472011-02-26 01:02:56 +00003144PyObject *
3145PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003146 const char *encoding,
3147 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003148{
3149 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003150 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003151
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152 if (!PyUnicode_Check(unicode)) {
3153 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003154 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155 }
Fred Drakee4315f52000-05-09 19:53:39 +00003156
Fred Drakee4315f52000-05-09 19:53:39 +00003157 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003158 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003159 if ((strcmp(lower, "utf-8") == 0) ||
3160 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003161 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003162 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003163 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003164 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003165 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003166 }
Victor Stinner37296e82010-06-10 13:36:23 +00003167 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003168 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003169 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003170 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003171#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003172 else if (strcmp(lower, "mbcs") == 0)
3173 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003174#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003175 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003176 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003178
3179 /* Encode via the codec registry */
3180 v = PyCodec_Encode(unicode, encoding, errors);
3181 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003182 return NULL;
3183
3184 /* The normal path */
3185 if (PyBytes_Check(v))
3186 return v;
3187
3188 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003189 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003190 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003191 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003192
3193 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3194 "encoder %s returned bytearray instead of bytes",
3195 encoding);
3196 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003197 Py_DECREF(v);
3198 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003199 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003200
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003201 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3202 Py_DECREF(v);
3203 return b;
3204 }
3205
3206 PyErr_Format(PyExc_TypeError,
3207 "encoder did not return a bytes object (type=%.400s)",
3208 Py_TYPE(v)->tp_name);
3209 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003210 return NULL;
3211}
3212
Alexander Belopolsky40018472011-02-26 01:02:56 +00003213PyObject *
3214PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003215 const char *encoding,
3216 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003217{
3218 PyObject *v;
3219
3220 if (!PyUnicode_Check(unicode)) {
3221 PyErr_BadArgument();
3222 goto onError;
3223 }
3224
3225 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003226 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003227
3228 /* Encode via the codec registry */
3229 v = PyCodec_Encode(unicode, encoding, errors);
3230 if (v == NULL)
3231 goto onError;
3232 if (!PyUnicode_Check(v)) {
3233 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003234 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003235 Py_TYPE(v)->tp_name);
3236 Py_DECREF(v);
3237 goto onError;
3238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003240
Benjamin Peterson29060642009-01-31 22:14:21 +00003241 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242 return NULL;
3243}
3244
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003245PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003246PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003247 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003248 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3249}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003250
Christian Heimes5894ba72007-11-04 11:43:14 +00003251PyObject*
3252PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3253{
Victor Stinner99b95382011-07-04 14:23:54 +02003254#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003255 return PyUnicode_DecodeMBCS(s, size, NULL);
3256#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003257 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003258#else
Victor Stinner793b5312011-04-27 00:24:21 +02003259 PyInterpreterState *interp = PyThreadState_GET()->interp;
3260 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3261 cannot use it to encode and decode filenames before it is loaded. Load
3262 the Python codec requires to encode at least its own filename. Use the C
3263 version of the locale codec until the codec registry is initialized and
3264 the Python codec is loaded.
3265
3266 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3267 cannot only rely on it: check also interp->fscodec_initialized for
3268 subinterpreters. */
3269 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003270 return PyUnicode_Decode(s, size,
3271 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003272 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003273 }
3274 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003275 /* locale encoding with surrogateescape */
3276 wchar_t *wchar;
3277 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003278 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003279
3280 if (s[size] != '\0' || size != strlen(s)) {
3281 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3282 return NULL;
3283 }
3284
Victor Stinner168e1172010-10-16 23:16:16 +00003285 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003286 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003287 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003288
Victor Stinner168e1172010-10-16 23:16:16 +00003289 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003290 PyMem_Free(wchar);
3291 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003292 }
Victor Stinnerad158722010-10-27 00:25:46 +00003293#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003294}
3295
Martin v. Löwis011e8422009-05-05 04:43:17 +00003296
3297int
3298PyUnicode_FSConverter(PyObject* arg, void* addr)
3299{
3300 PyObject *output = NULL;
3301 Py_ssize_t size;
3302 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003303 if (arg == NULL) {
3304 Py_DECREF(*(PyObject**)addr);
3305 return 1;
3306 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003307 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003308 output = arg;
3309 Py_INCREF(output);
3310 }
3311 else {
3312 arg = PyUnicode_FromObject(arg);
3313 if (!arg)
3314 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003315 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003316 Py_DECREF(arg);
3317 if (!output)
3318 return 0;
3319 if (!PyBytes_Check(output)) {
3320 Py_DECREF(output);
3321 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3322 return 0;
3323 }
3324 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003325 size = PyBytes_GET_SIZE(output);
3326 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003327 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003328 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003329 Py_DECREF(output);
3330 return 0;
3331 }
3332 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003333 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003334}
3335
3336
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003337int
3338PyUnicode_FSDecoder(PyObject* arg, void* addr)
3339{
3340 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003341 if (arg == NULL) {
3342 Py_DECREF(*(PyObject**)addr);
3343 return 1;
3344 }
3345 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003346 if (PyUnicode_READY(arg))
3347 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003348 output = arg;
3349 Py_INCREF(output);
3350 }
3351 else {
3352 arg = PyBytes_FromObject(arg);
3353 if (!arg)
3354 return 0;
3355 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3356 PyBytes_GET_SIZE(arg));
3357 Py_DECREF(arg);
3358 if (!output)
3359 return 0;
3360 if (!PyUnicode_Check(output)) {
3361 Py_DECREF(output);
3362 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3363 return 0;
3364 }
3365 }
Victor Stinner065836e2011-10-27 01:56:33 +02003366 if (PyUnicode_READY(output) < 0) {
3367 Py_DECREF(output);
3368 return 0;
3369 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003370 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003371 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003372 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3373 Py_DECREF(output);
3374 return 0;
3375 }
3376 *(PyObject**)addr = output;
3377 return Py_CLEANUP_SUPPORTED;
3378}
3379
3380
Martin v. Löwis5b222132007-06-10 09:51:05 +00003381char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003382PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003383{
Christian Heimesf3863112007-11-22 07:46:41 +00003384 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003385
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003386 if (!PyUnicode_Check(unicode)) {
3387 PyErr_BadArgument();
3388 return NULL;
3389 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003390 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003391 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003392
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003393 if (PyUnicode_UTF8(unicode) == NULL) {
3394 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003395 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3396 if (bytes == NULL)
3397 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003398 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3399 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003400 Py_DECREF(bytes);
3401 return NULL;
3402 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003403 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3404 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3405 PyBytes_AS_STRING(bytes),
3406 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003407 Py_DECREF(bytes);
3408 }
3409
3410 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003411 *psize = PyUnicode_UTF8_LENGTH(unicode);
3412 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003413}
3414
3415char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003416PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003417{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003418 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3419}
3420
3421#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003422static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003423#endif
3424
3425
3426Py_UNICODE *
3427PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3428{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003429 const unsigned char *one_byte;
3430#if SIZEOF_WCHAR_T == 4
3431 const Py_UCS2 *two_bytes;
3432#else
3433 const Py_UCS4 *four_bytes;
3434 const Py_UCS4 *ucs4_end;
3435 Py_ssize_t num_surrogates;
3436#endif
3437 wchar_t *w;
3438 wchar_t *wchar_end;
3439
3440 if (!PyUnicode_Check(unicode)) {
3441 PyErr_BadArgument();
3442 return NULL;
3443 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003444 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003445 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003446 assert(_PyUnicode_KIND(unicode) != 0);
3447 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003448
3449#ifdef Py_DEBUG
3450 ++unicode_as_unicode_calls;
3451#endif
3452
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003453 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003454#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003455 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3456 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003457 num_surrogates = 0;
3458
3459 for (; four_bytes < ucs4_end; ++four_bytes) {
3460 if (*four_bytes > 0xFFFF)
3461 ++num_surrogates;
3462 }
3463
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003464 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3465 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3466 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003467 PyErr_NoMemory();
3468 return NULL;
3469 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003470 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003471
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003472 w = _PyUnicode_WSTR(unicode);
3473 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3474 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003475 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3476 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003477 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003478 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003479 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3480 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003481 }
3482 else
3483 *w = *four_bytes;
3484
3485 if (w > wchar_end) {
3486 assert(0 && "Miscalculated string end");
3487 }
3488 }
3489 *w = 0;
3490#else
3491 /* sizeof(wchar_t) == 4 */
3492 Py_FatalError("Impossible unicode object state, wstr and str "
3493 "should share memory already.");
3494 return NULL;
3495#endif
3496 }
3497 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003498 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3499 (_PyUnicode_LENGTH(unicode) + 1));
3500 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003501 PyErr_NoMemory();
3502 return NULL;
3503 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003504 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3505 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3506 w = _PyUnicode_WSTR(unicode);
3507 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003508
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003509 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3510 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003511 for (; w < wchar_end; ++one_byte, ++w)
3512 *w = *one_byte;
3513 /* null-terminate the wstr */
3514 *w = 0;
3515 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003516 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003517#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003518 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003519 for (; w < wchar_end; ++two_bytes, ++w)
3520 *w = *two_bytes;
3521 /* null-terminate the wstr */
3522 *w = 0;
3523#else
3524 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003525 PyObject_FREE(_PyUnicode_WSTR(unicode));
3526 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003527 Py_FatalError("Impossible unicode object state, wstr "
3528 "and str should share memory already.");
3529 return NULL;
3530#endif
3531 }
3532 else {
3533 assert(0 && "This should never happen.");
3534 }
3535 }
3536 }
3537 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003538 *size = PyUnicode_WSTR_LENGTH(unicode);
3539 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003540}
3541
Alexander Belopolsky40018472011-02-26 01:02:56 +00003542Py_UNICODE *
3543PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003545 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003546}
3547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003548
Alexander Belopolsky40018472011-02-26 01:02:56 +00003549Py_ssize_t
3550PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551{
3552 if (!PyUnicode_Check(unicode)) {
3553 PyErr_BadArgument();
3554 goto onError;
3555 }
3556 return PyUnicode_GET_SIZE(unicode);
3557
Benjamin Peterson29060642009-01-31 22:14:21 +00003558 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559 return -1;
3560}
3561
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003562Py_ssize_t
3563PyUnicode_GetLength(PyObject *unicode)
3564{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003565 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003566 PyErr_BadArgument();
3567 return -1;
3568 }
3569
3570 return PyUnicode_GET_LENGTH(unicode);
3571}
3572
3573Py_UCS4
3574PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3575{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003576 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3577 PyErr_BadArgument();
3578 return (Py_UCS4)-1;
3579 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003580 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003581 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003582 return (Py_UCS4)-1;
3583 }
3584 return PyUnicode_READ_CHAR(unicode, index);
3585}
3586
3587int
3588PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3589{
3590 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003591 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003592 return -1;
3593 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003594 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003595 PyErr_SetString(PyExc_IndexError, "string index out of range");
3596 return -1;
3597 }
3598 if (_PyUnicode_Dirty(unicode))
3599 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003600 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3601 index, ch);
3602 return 0;
3603}
3604
/* Return the name of the default encoding.  The value is the fixed,
   NUL-terminated string "utf-8" held in static storage; the caller must
   not modify or free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3610
Victor Stinner554f3f02010-06-16 23:33:54 +00003611/* create or adjust a UnicodeDecodeError */
3612static void
3613make_decode_exception(PyObject **exceptionObject,
3614 const char *encoding,
3615 const char *input, Py_ssize_t length,
3616 Py_ssize_t startpos, Py_ssize_t endpos,
3617 const char *reason)
3618{
3619 if (*exceptionObject == NULL) {
3620 *exceptionObject = PyUnicodeDecodeError_Create(
3621 encoding, input, length, startpos, endpos, reason);
3622 }
3623 else {
3624 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3625 goto onError;
3626 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3627 goto onError;
3628 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3629 goto onError;
3630 }
3631 return;
3632
3633onError:
3634 Py_DECREF(*exceptionObject);
3635 *exceptionObject = NULL;
3636}
3637
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003638/* error handling callback helper:
3639 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003640 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003641 and adjust various state variables.
3642 return 0 on success, -1 on error
3643*/
3644
Alexander Belopolsky40018472011-02-26 01:02:56 +00003645static int
3646unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003647 const char *encoding, const char *reason,
3648 const char **input, const char **inend, Py_ssize_t *startinpos,
3649 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003650 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003651{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003652 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653
3654 PyObject *restuple = NULL;
3655 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003656 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003657 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003658 Py_ssize_t requiredsize;
3659 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003660 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003661 int res = -1;
3662
Victor Stinner596a6c42011-11-09 00:02:18 +01003663 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3664 outsize = PyUnicode_GET_LENGTH(*output);
3665 else
3666 outsize = _PyUnicode_WSTR_LENGTH(*output);
3667
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003668 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003669 *errorHandler = PyCodec_LookupError(errors);
3670 if (*errorHandler == NULL)
3671 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003672 }
3673
Victor Stinner554f3f02010-06-16 23:33:54 +00003674 make_decode_exception(exceptionObject,
3675 encoding,
3676 *input, *inend - *input,
3677 *startinpos, *endinpos,
3678 reason);
3679 if (*exceptionObject == NULL)
3680 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003681
3682 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3683 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003684 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003686 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003687 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003688 }
3689 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003690 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003691 if (PyUnicode_READY(repunicode) < 0)
3692 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003693
3694 /* Copy back the bytes variables, which might have been modified by the
3695 callback */
3696 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3697 if (!inputobj)
3698 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003699 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003700 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003701 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003702 *input = PyBytes_AS_STRING(inputobj);
3703 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003704 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003705 /* we can DECREF safely, as the exception has another reference,
3706 so the object won't go away. */
3707 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003708
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003709 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003710 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003711 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003712 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3713 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003714 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003715
Victor Stinner596a6c42011-11-09 00:02:18 +01003716 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3717 /* need more space? (at least enough for what we
3718 have+the replacement+the rest of the string (starting
3719 at the new input position), so we won't have to check space
3720 when there are no errors in the rest of the string) */
3721 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3722 requiredsize = *outpos + replen + insize-newpos;
3723 if (requiredsize > outsize) {
3724 if (requiredsize<2*outsize)
3725 requiredsize = 2*outsize;
3726 if (unicode_resize(output, requiredsize) < 0)
3727 goto onError;
3728 }
3729 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003730 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003731 copy_characters(*output, *outpos, repunicode, 0, replen);
3732 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003733 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003734 else {
3735 wchar_t *repwstr;
3736 Py_ssize_t repwlen;
3737 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3738 if (repwstr == NULL)
3739 goto onError;
3740 /* need more space? (at least enough for what we
3741 have+the replacement+the rest of the string (starting
3742 at the new input position), so we won't have to check space
3743 when there are no errors in the rest of the string) */
3744 requiredsize = *outpos + repwlen + insize-newpos;
3745 if (requiredsize > outsize) {
3746 if (requiredsize < 2*outsize)
3747 requiredsize = 2*outsize;
3748 if (unicode_resize(output, requiredsize) < 0)
3749 goto onError;
3750 }
3751 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3752 *outpos += repwlen;
3753 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003754 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003755 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003756
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003757 /* we made it! */
3758 res = 0;
3759
Benjamin Peterson29060642009-01-31 22:14:21 +00003760 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003761 Py_XDECREF(restuple);
3762 return res;
3763}
3764
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003765/* --- UTF-7 Codec -------------------------------------------------------- */
3766
Antoine Pitrou244651a2009-05-04 18:56:13 +00003767/* See RFC2152 for details. We encode conservatively and decode liberally. */
3768
/* Base-64 helper macros for the UTF-7 codec.  Each macro may evaluate
   its argument more than once, so pass side-effect-free expressions. */

/* Non-zero iff c belongs to the base-64 alphabet
   (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ((c) >= '0' && (c) <= '9') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= 'A' && (c) <= 'Z'))

/* Base-64 value (0..63) of c.  c is expected to satisfy IS_BASE64();
   any character not matched by an earlier test yields 63. */

#define FROM_BASE64(c) \
    (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     (c) == '+' ? 62 : 63)

/* Base-64 character encoding the low 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
     "abcdefghijklmnopqrstuvwxyz" \
     "0123456789+/"[(n) & 63])

/* DECODE_DIRECT: non-zero iff a byte found in UTF-7 input decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
3799
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This read-only table, indexed by the ASCII
 * code, identifies these different sets:
 *   0 : "Set D"
 *       alphanumeric and '(),-./:?
 *   1 : "Set O"
 *       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"
 *       ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first, so utf7_category is only indexed with
 * values in 1..127.  The flag arguments are parenthesized so that any
 * expression can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003845
Alexander Belopolsky40018472011-02-26 01:02:56 +00003846PyObject *
3847PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003848 Py_ssize_t size,
3849 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003850{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003851 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3852}
3853
Antoine Pitrou244651a2009-05-04 18:56:13 +00003854/* The decoder. The only state we preserve is our read position,
3855 * i.e. how many characters we have consumed. So if we end in the
3856 * middle of a shift sequence we have to back off the read position
3857 * and the output to the beginning of the sequence, otherwise we lose
3858 * all the shift state (seen bits, number of bits seen, high
3859 * surrogate). */
3860
Alexander Belopolsky40018472011-02-26 01:02:56 +00003861PyObject *
3862PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003863 Py_ssize_t size,
3864 const char *errors,
3865 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003866{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003867 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003868 Py_ssize_t startinpos;
3869 Py_ssize_t endinpos;
3870 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003871 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003872 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003873 const char *errmsg = "";
3874 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003875 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003876 unsigned int base64bits = 0;
3877 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01003878 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003879 PyObject *errorHandler = NULL;
3880 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003881
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003882 /* Start off assuming it's all ASCII. Widen later as necessary. */
3883 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003884 if (!unicode)
3885 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003886 if (size == 0) {
3887 if (consumed)
3888 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003889 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003890 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003891
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003892 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003893 e = s + size;
3894
3895 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003896 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003897 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003898 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003899
Antoine Pitrou244651a2009-05-04 18:56:13 +00003900 if (inShift) { /* in a base-64 section */
3901 if (IS_BASE64(ch)) { /* consume a base-64 character */
3902 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3903 base64bits += 6;
3904 s++;
3905 if (base64bits >= 16) {
3906 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01003907 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00003908 base64bits -= 16;
3909 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3910 if (surrogate) {
3911 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01003912 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
3913 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003914 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3915 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003916 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003917 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003918 }
3919 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003920 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3921 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003922 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003923 }
3924 }
Victor Stinner551ac952011-11-29 22:58:13 +01003925 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003926 /* first surrogate */
3927 surrogate = outCh;
3928 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003929 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003930 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3931 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003932 }
3933 }
3934 }
3935 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003936 inShift = 0;
3937 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003938 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003939 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3940 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003941 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003942 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003943 if (base64bits > 0) { /* left-over bits */
3944 if (base64bits >= 6) {
3945 /* We've seen at least one base-64 character */
3946 errmsg = "partial character in shift sequence";
3947 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003948 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003949 else {
3950 /* Some bits remain; they should be zero */
3951 if (base64buffer != 0) {
3952 errmsg = "non-zero padding bits in shift sequence";
3953 goto utf7Error;
3954 }
3955 }
3956 }
3957 if (ch != '-') {
3958 /* '-' is absorbed; other terminating
3959 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003960 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3961 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003962 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003963 }
3964 }
3965 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003967 s++; /* consume '+' */
3968 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003969 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003970 if (unicode_putchar(&unicode, &outpos, '+') < 0)
3971 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003972 }
3973 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003974 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003975 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003976 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003977 }
3978 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003979 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003980 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3981 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003982 s++;
3983 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003984 else {
3985 startinpos = s-starts;
3986 s++;
3987 errmsg = "unexpected special character";
3988 goto utf7Error;
3989 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003990 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003991utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003992 endinpos = s-starts;
3993 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003994 errors, &errorHandler,
3995 "utf7", errmsg,
3996 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003997 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003998 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003999 }
4000
Antoine Pitrou244651a2009-05-04 18:56:13 +00004001 /* end of string */
4002
4003 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4004 /* if we're in an inconsistent state, that's an error */
4005 if (surrogate ||
4006 (base64bits >= 6) ||
4007 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004008 endinpos = size;
4009 if (unicode_decode_call_errorhandler(
4010 errors, &errorHandler,
4011 "utf7", "unterminated shift sequence",
4012 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004013 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004014 goto onError;
4015 if (s < e)
4016 goto restart;
4017 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004018 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004019
4020 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004021 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004022 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004023 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004024 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004025 }
4026 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004027 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004028 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004029 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004030
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004031 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004032 goto onError;
4033
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004034 Py_XDECREF(errorHandler);
4035 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004036 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004037
Benjamin Peterson29060642009-01-31 22:14:21 +00004038 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004039 Py_XDECREF(errorHandler);
4040 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004041 Py_DECREF(unicode);
4042 return NULL;
4043}
4044
4045
Alexander Belopolsky40018472011-02-26 01:02:56 +00004046PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004047_PyUnicode_EncodeUTF7(PyObject *str,
4048 int base64SetO,
4049 int base64WhiteSpace,
4050 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004051{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004052 int kind;
4053 void *data;
4054 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004055 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004056 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004057 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004058 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004059 unsigned int base64bits = 0;
4060 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004061 char * out;
4062 char * start;
4063
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004064 if (PyUnicode_READY(str) < 0)
4065 return NULL;
4066 kind = PyUnicode_KIND(str);
4067 data = PyUnicode_DATA(str);
4068 len = PyUnicode_GET_LENGTH(str);
4069
4070 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004071 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004072
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004073 /* It might be possible to tighten this worst case */
4074 allocated = 8 * len;
4075 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004076 return PyErr_NoMemory();
4077
Antoine Pitrou244651a2009-05-04 18:56:13 +00004078 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004079 if (v == NULL)
4080 return NULL;
4081
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004082 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004083 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004084 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004085
Antoine Pitrou244651a2009-05-04 18:56:13 +00004086 if (inShift) {
4087 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4088 /* shifting out */
4089 if (base64bits) { /* output remaining bits */
4090 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4091 base64buffer = 0;
4092 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004093 }
4094 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004095 /* Characters not in the BASE64 set implicitly unshift the sequence
4096 so no '-' is required, except if the character is itself a '-' */
4097 if (IS_BASE64(ch) || ch == '-') {
4098 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004099 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004100 *out++ = (char) ch;
4101 }
4102 else {
4103 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004104 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004105 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004106 else { /* not in a shift sequence */
4107 if (ch == '+') {
4108 *out++ = '+';
4109 *out++ = '-';
4110 }
4111 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4112 *out++ = (char) ch;
4113 }
4114 else {
4115 *out++ = '+';
4116 inShift = 1;
4117 goto encode_char;
4118 }
4119 }
4120 continue;
4121encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004122 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004123 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004124
Antoine Pitrou244651a2009-05-04 18:56:13 +00004125 /* code first surrogate */
4126 base64bits += 16;
4127 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4128 while (base64bits >= 6) {
4129 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4130 base64bits -= 6;
4131 }
4132 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004133 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004134 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004135 base64bits += 16;
4136 base64buffer = (base64buffer << 16) | ch;
4137 while (base64bits >= 6) {
4138 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4139 base64bits -= 6;
4140 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004141 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004142 if (base64bits)
4143 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4144 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004145 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004146 if (_PyBytes_Resize(&v, out - start) < 0)
4147 return NULL;
4148 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004149}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004150PyObject *
4151PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4152 Py_ssize_t size,
4153 int base64SetO,
4154 int base64WhiteSpace,
4155 const char *errors)
4156{
4157 PyObject *result;
4158 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4159 if (tmp == NULL)
4160 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004161 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004162 base64WhiteSpace, errors);
4163 Py_DECREF(tmp);
4164 return result;
4165}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004166
/* The UTF-7 helper macros are private to the codec above; remove them
   so that they cannot leak into the rest of the file. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004172
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173/* --- UTF-8 Codec -------------------------------------------------------- */
4174
/* Map a UTF-8 encoded prefix (first) byte to the length in bytes of the
   sequence it starts.  Zero means illegal prefix: continuation bytes
   (80-BF), the overlong prefixes C0-C1 and F5-FF.  See RFC 3629 for
   details.  The table is read-only. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4196
Alexander Belopolsky40018472011-02-26 01:02:56 +00004197PyObject *
4198PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004199 Py_ssize_t size,
4200 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004201{
Walter Dörwald69652032004-09-07 20:24:22 +00004202 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4203}
4204
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004205#include "stringlib/ucs1lib.h"
4206#include "stringlib/codecs.h"
4207#include "stringlib/undef.h"
4208
4209#include "stringlib/ucs2lib.h"
4210#include "stringlib/codecs.h"
4211#include "stringlib/undef.h"
4212
4213#include "stringlib/ucs4lib.h"
4214#include "stringlib/codecs.h"
4215#include "stringlib/undef.h"
4216
Antoine Pitrouab868312009-01-10 15:40:25 +00004217/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4218#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4219
4220/* Mask to quickly check whether a C 'long' contains a
4221 non-ASCII, UTF8-encoded char. */
4222#if (SIZEOF_LONG == 8)
4223# define ASCII_CHAR_MASK 0x8080808080808080L
4224#elif (SIZEOF_LONG == 4)
4225# define ASCII_CHAR_MASK 0x80808080L
4226#else
4227# error C 'long' size should be either 4 or 8!
4228#endif
4229
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004230/* Scans a UTF-8 string and returns the maximum character to be expected
4231 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004232
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004233 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004234 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004235 */
4236static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004237utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004239 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004240 const unsigned char *end = p + string_size;
4241 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004242
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004243 assert(unicode_size != NULL);
4244
4245 /* By having a cascade of independent loops which fallback onto each
4246 other, we minimize the amount of work done in the average loop
4247 iteration, and we also maximize the CPU's ability to predict
4248 branches correctly (because a given condition will have always the
4249 same boolean outcome except perhaps in the last iteration of the
4250 corresponding loop).
4251 In the general case this brings us rather close to decoding
4252 performance pre-PEP 393, despite the two-pass decoding.
4253
4254 Note that the pure ASCII loop is not duplicated once a non-ASCII
4255 character has been encountered. It is actually a pessimization (by
4256 a significant factor) to use this loop on text with many non-ASCII
4257 characters, and it is important to avoid bad performance on valid
4258 utf-8 data (invalid utf-8 being a different can of worms).
4259 */
4260
4261 /* ASCII */
4262 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004263 /* Only check value if it's not a ASCII char... */
4264 if (*p < 0x80) {
4265 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4266 an explanation. */
4267 if (!((size_t) p & LONG_PTR_MASK)) {
4268 /* Help register allocation */
4269 register const unsigned char *_p = p;
4270 while (_p < aligned_end) {
4271 unsigned long value = *(unsigned long *) _p;
4272 if (value & ASCII_CHAR_MASK)
4273 break;
4274 _p += SIZEOF_LONG;
4275 char_count += SIZEOF_LONG;
4276 }
4277 p = _p;
4278 if (p == end)
4279 break;
4280 }
4281 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004282 if (*p < 0x80)
4283 ++char_count;
4284 else
4285 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004286 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004287 *unicode_size = char_count;
4288 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004289
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004290_ucs1loop:
4291 for (; p < end; ++p) {
4292 if (*p < 0xc4)
4293 char_count += ((*p & 0xc0) != 0x80);
4294 else
4295 goto _ucs2loop;
4296 }
4297 *unicode_size = char_count;
4298 return 255;
4299
4300_ucs2loop:
4301 for (; p < end; ++p) {
4302 if (*p < 0xf0)
4303 char_count += ((*p & 0xc0) != 0x80);
4304 else
4305 goto _ucs4loop;
4306 }
4307 *unicode_size = char_count;
4308 return 65535;
4309
4310_ucs4loop:
4311 for (; p < end; ++p) {
4312 char_count += ((*p & 0xc0) != 0x80);
4313 }
4314 *unicode_size = char_count;
4315 return 65537;
4316}
4317
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004318/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
Victor Stinner785938e2011-12-11 20:09:03 +01004319 in case of errors. Implicit parameters: unicode, kind, data, onError.
4320 Potential resizing overallocates, so the result needs to shrink at the end.
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004321*/
Victor Stinner785938e2011-12-11 20:09:03 +01004322#define WRITE_MAYBE_FAIL(index, value) \
4323 do { \
4324 Py_ssize_t pos = index; \
4325 if (pos > PyUnicode_GET_LENGTH(unicode) && \
4326 unicode_resize(&unicode, pos + pos/8) < 0) \
4327 goto onError; \
4328 if (unicode_putchar(&unicode, &pos, value) < 0) \
4329 goto onError; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004330 } while (0)
4331
Alexander Belopolsky40018472011-02-26 01:02:56 +00004332PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004333decode_utf8_errors(const char *starts,
4334 Py_ssize_t size,
4335 const char *errors,
4336 Py_ssize_t *consumed,
4337 const char *s,
4338 PyObject *unicode,
4339 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004340{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004342 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004343 Py_ssize_t startinpos;
4344 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004345 const char *e = starts + size;
4346 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004347 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004348 PyObject *errorHandler = NULL;
4349 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004350
Antoine Pitrouab868312009-01-10 15:40:25 +00004351 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004352
4353 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004354 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004355
4356 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004357 /* Fast path for runs of ASCII characters. Given that common UTF-8
4358 input will consist of an overwhelming majority of ASCII
4359 characters, we try to optimize for this case by checking
4360 as many characters as a C 'long' can contain.
4361 First, check if we can do an aligned read, as most CPUs have
4362 a penalty for unaligned reads.
4363 */
4364 if (!((size_t) s & LONG_PTR_MASK)) {
4365 /* Help register allocation */
4366 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004367 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004368 while (_s < aligned_end) {
4369 /* Read a whole long at a time (either 4 or 8 bytes),
4370 and do a fast unrolled copy if it only contains ASCII
4371 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004372 unsigned long value = *(unsigned long *) _s;
4373 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004374 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004375 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4376 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4377 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4378 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004379#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004380 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4381 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4382 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4383 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004384#endif
4385 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004386 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004387 }
4388 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004389 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004390 if (s == e)
4391 break;
4392 ch = (unsigned char)*s;
4393 }
4394 }
4395
4396 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004397 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398 s++;
4399 continue;
4400 }
4401
4402 n = utf8_code_length[ch];
4403
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004404 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004405 if (consumed)
4406 break;
4407 else {
4408 errmsg = "unexpected end of data";
4409 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004410 endinpos = startinpos+1;
4411 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4412 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004413 goto utf8Error;
4414 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004415 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416
4417 switch (n) {
4418
4419 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004420 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004421 startinpos = s-starts;
4422 endinpos = startinpos+1;
4423 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004424
4425 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004426 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004427 startinpos = s-starts;
4428 endinpos = startinpos+1;
4429 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004430
4431 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004432 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004433 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004434 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004435 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004436 goto utf8Error;
4437 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004439 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004440 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441 break;
4442
4443 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004444 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4445 will result in surrogates in range d800-dfff. Surrogates are
4446 not valid UTF-8 so they are rejected.
4447 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4448 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004449 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004450 (s[2] & 0xc0) != 0x80 ||
4451 ((unsigned char)s[0] == 0xE0 &&
4452 (unsigned char)s[1] < 0xA0) ||
4453 ((unsigned char)s[0] == 0xED &&
4454 (unsigned char)s[1] > 0x9F)) {
4455 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004456 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004457 endinpos = startinpos + 1;
4458
4459 /* if s[1] first two bits are 1 and 0, then the invalid
4460 continuation byte is s[2], so increment endinpos by 1,
4461 if not, s[1] is invalid and endinpos doesn't need to
4462 be incremented. */
4463 if ((s[1] & 0xC0) == 0x80)
4464 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004465 goto utf8Error;
4466 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004468 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004469 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004470 break;
4471
4472 case 4:
4473 if ((s[1] & 0xc0) != 0x80 ||
4474 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004475 (s[3] & 0xc0) != 0x80 ||
4476 ((unsigned char)s[0] == 0xF0 &&
4477 (unsigned char)s[1] < 0x90) ||
4478 ((unsigned char)s[0] == 0xF4 &&
4479 (unsigned char)s[1] > 0x8F)) {
4480 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004482 endinpos = startinpos + 1;
4483 if ((s[1] & 0xC0) == 0x80) {
4484 endinpos++;
4485 if ((s[2] & 0xC0) == 0x80)
4486 endinpos++;
4487 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004488 goto utf8Error;
4489 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004490 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004491 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004492 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004493
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004494 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004496 }
4497 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004498 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004499
Benjamin Peterson29060642009-01-31 22:14:21 +00004500 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004501 if (unicode_decode_call_errorhandler(
4502 errors, &errorHandler,
4503 "utf8", errmsg,
4504 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004505 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004506 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004507 /* Update data because unicode_decode_call_errorhandler might have
4508 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510 }
Walter Dörwald69652032004-09-07 20:24:22 +00004511 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004514 /* Adjust length and ready string when it contained errors and
4515 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004516 if (unicode_resize(&unicode, i) < 0)
4517 goto onError;
4518 unicode_adjust_maxchar(&unicode);
4519 if (unicode == NULL)
4520 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004522 Py_XDECREF(errorHandler);
4523 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004524 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004525 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526
Benjamin Peterson29060642009-01-31 22:14:21 +00004527 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528 Py_XDECREF(errorHandler);
4529 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004530 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004531 return NULL;
4532}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004533#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004534
Victor Stinner785938e2011-12-11 20:09:03 +01004535PyObject *
4536PyUnicode_DecodeUTF8Stateful(const char *s,
4537 Py_ssize_t size,
4538 const char *errors,
4539 Py_ssize_t *consumed)
4540{
4541 Py_UCS4 maxchar = 0;
4542 Py_ssize_t unicode_size;
4543 int has_errors = 0;
4544 PyObject *unicode;
4545 int kind;
4546 void *data;
4547 const char *starts = s;
4548 const char *e;
4549 Py_ssize_t i;
4550
4551 if (size == 0) {
4552 if (consumed)
4553 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004554 Py_INCREF(unicode_empty);
4555 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004556 }
4557
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004558 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004559
4560 /* When the string is ASCII only, just use memcpy and return.
4561 unicode_size may be != size if there is an incomplete UTF-8
4562 sequence at the end of the ASCII block. */
4563 if (maxchar < 128 && size == unicode_size) {
4564 if (consumed)
4565 *consumed = size;
4566 return unicode_fromascii(s, size);
4567 }
4568
4569 unicode = PyUnicode_New(unicode_size, maxchar);
4570 if (!unicode)
4571 return NULL;
4572 kind = PyUnicode_KIND(unicode);
4573 data = PyUnicode_DATA(unicode);
4574
4575 /* Unpack UTF-8 encoded data */
4576 i = 0;
4577 e = starts + size;
4578 switch (kind) {
4579 case PyUnicode_1BYTE_KIND:
4580 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4581 break;
4582 case PyUnicode_2BYTE_KIND:
4583 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4584 break;
4585 case PyUnicode_4BYTE_KIND:
4586 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4587 break;
4588 }
4589 if (!has_errors) {
4590 /* Ensure the unicode size calculation was correct */
4591 assert(i == unicode_size);
4592 assert(s == e);
4593 if (consumed)
4594 *consumed = size;
4595 return unicode;
4596 }
4597
4598 /* In case of errors, maxchar and size computation might be incorrect;
4599 code below refits and resizes as necessary. */
4600 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4601}
4602
#ifdef __APPLE__

/* Simplified UTF-8 decoder with built-in surrogateescape behaviour, used
   to decode the command line arguments on Mac OS X.

   Decodes `size` bytes at `s` into a NUL-terminated wchar_t array obtained
   from PyMem_Malloc().  A byte that cannot be decoded is stored as
   0xDC00 + byte and decoding resumes at the following byte.

   Returns NULL if the buffer size would overflow (after calling
   PyErr_NoMemory()) or if the allocation fails.
   NOTE(review): the caller presumably releases the result with
   PyMem_Free() -- confirm at the call site. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *buffer, *out;
    int seqlen;

    /* One wchar_t per input byte (plus the terminator) is always enough:
       the result never has more elements than the input has bytes. */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!buffer)
        return NULL;

    out = buffer;
    end = s + size;
    while (s < end) {
        /* Unsigned view of the bytes at the current position. */
        const unsigned char *b = (const unsigned char *)s;
        const Py_UCS4 lead = b[0];
        Py_UCS4 ch;

        /* ASCII is copied straight through. */
        if (lead < 0x80) {
            *out++ = (wchar_t)lead;
            s++;
            continue;
        }

        seqlen = utf8_code_length[lead];
        if (s + seqlen > end) {
            /* sequence cut off by the end of the input */
            goto surrogateescape;
        }

        switch (seqlen) {
        case 0:
        case 1:
            /* not a valid start byte */
            goto surrogateescape;

        case 2:
            if ((b[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((b[0] & 0x1f) << 6) + (b[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *out++ = (wchar_t)ch;
            break;

        case 3:
            /* \xed\xa0\x80-\xed\xbf\xbf would decode to the surrogate
               range d800-dfff, which is not valid UTF-8, and \xe0 followed
               by a byte below \xa0 is an overlong form; both are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((b[1] & 0xc0) != 0x80 ||
                (b[2] & 0xc0) != 0x80 ||
                (b[0] == 0xE0 && b[1] < 0xA0) ||
                (b[0] == 0xED && b[1] > 0x9F)) {
                goto surrogateescape;
            }
            ch = ((b[0] & 0x0f) << 12) + ((b[1] & 0x3f) << 6) + (b[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *out++ = (wchar_t)ch;
            break;

        case 4:
            if ((b[1] & 0xc0) != 0x80 ||
                (b[2] & 0xc0) != 0x80 ||
                (b[3] & 0xc0) != 0x80 ||
                (b[0] == 0xF0 && b[1] < 0x90) ||
                (b[0] == 0xF4 && b[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((b[0] & 0x7) << 18) + ((b[1] & 0x3f) << 12) +
                 ((b[2] & 0x3f) << 6) + (b[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
            *out++ = (wchar_t)ch;
#else
            /* 16-bit wchar_t: store the character as a surrogate pair */
            *out++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            *out++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
            break;
        }
        s += seqlen;
        continue;

      surrogateescape:
        /* Escape the offending byte only, then retry from the next one. */
        *out++ = 0xDC00 + lead;
        s++;
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004711/* Primary internal function which creates utf8 encoded bytes objects.
4712
4713 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004714 and allocate exactly as much space needed at the end. Else allocate the
4715 maximum possible needed (4 result bytes per Unicode character), and return
4716 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004717*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004718PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004719_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004720{
Tim Peters602f7402002-04-27 18:03:26 +00004721#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004722
Guido van Rossum98297ee2007-11-06 21:34:58 +00004723 Py_ssize_t i; /* index into s of next input byte */
4724 PyObject *result; /* result string object */
4725 char *p; /* next free byte in output buffer */
4726 Py_ssize_t nallocated; /* number of result bytes allocated */
4727 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004728 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004729 PyObject *errorHandler = NULL;
4730 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004731 int kind;
4732 void *data;
4733 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004734 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004736 if (!PyUnicode_Check(unicode)) {
4737 PyErr_BadArgument();
4738 return NULL;
4739 }
4740
4741 if (PyUnicode_READY(unicode) == -1)
4742 return NULL;
4743
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004744 if (PyUnicode_UTF8(unicode))
4745 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4746 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004747
4748 kind = PyUnicode_KIND(unicode);
4749 data = PyUnicode_DATA(unicode);
4750 size = PyUnicode_GET_LENGTH(unicode);
4751
Tim Peters602f7402002-04-27 18:03:26 +00004752 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753
Tim Peters602f7402002-04-27 18:03:26 +00004754 if (size <= MAX_SHORT_UNICHARS) {
4755 /* Write into the stack buffer; nallocated can't overflow.
4756 * At the end, we'll allocate exactly as much heap space as it
4757 * turns out we need.
4758 */
4759 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004760 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004761 p = stackbuf;
4762 }
4763 else {
4764 /* Overallocate on the heap, and give the excess back at the end. */
4765 nallocated = size * 4;
4766 if (nallocated / 4 != size) /* overflow! */
4767 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004768 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004769 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004770 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004771 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004772 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004773
Tim Peters602f7402002-04-27 18:03:26 +00004774 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004775 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004776
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004777 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004778 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004780
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004782 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004783 *p++ = (char)(0xc0 | (ch >> 6));
4784 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner551ac952011-11-29 22:58:13 +01004785 } else if (Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004786 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004787 Py_ssize_t repsize, k, startpos;
4788 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004789 rep = unicode_encode_call_errorhandler(
4790 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004791 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004792 if (!rep)
4793 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004795 if (PyBytes_Check(rep))
4796 repsize = PyBytes_GET_SIZE(rep);
4797 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004798 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004799
4800 if (repsize > 4) {
4801 Py_ssize_t offset;
4802
4803 if (result == NULL)
4804 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004805 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004806 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004808 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4809 /* integer overflow */
4810 PyErr_NoMemory();
4811 goto error;
4812 }
4813 nallocated += repsize - 4;
4814 if (result != NULL) {
4815 if (_PyBytes_Resize(&result, nallocated) < 0)
4816 goto error;
4817 } else {
4818 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004819 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004820 goto error;
4821 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4822 }
4823 p = PyBytes_AS_STRING(result) + offset;
4824 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004826 if (PyBytes_Check(rep)) {
4827 char *prep = PyBytes_AS_STRING(rep);
4828 for(k = repsize; k > 0; k--)
4829 *p++ = *prep++;
4830 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004831 enum PyUnicode_Kind repkind;
4832 void *repdata;
4833
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004834 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004835 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004836 repkind = PyUnicode_KIND(rep);
4837 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004838
4839 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004840 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004841 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004842 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004843 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004844 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004845 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004846 goto error;
4847 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004848 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004849 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004850 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004851 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004852 } else if (ch < 0x10000) {
4853 *p++ = (char)(0xe0 | (ch >> 12));
4854 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4855 *p++ = (char)(0x80 | (ch & 0x3f));
4856 } else /* ch >= 0x10000 */ {
Victor Stinner8faf8212011-12-08 22:14:11 +01004857 assert(ch <= MAX_UNICODE);
Tim Peters602f7402002-04-27 18:03:26 +00004858 /* Encode UCS4 Unicode ordinals */
4859 *p++ = (char)(0xf0 | (ch >> 18));
4860 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4861 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4862 *p++ = (char)(0x80 | (ch & 0x3f));
4863 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004865
Guido van Rossum98297ee2007-11-06 21:34:58 +00004866 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004867 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004868 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004869 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004870 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004871 }
4872 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004873 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004874 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004875 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004876 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004877 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004878
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004879 Py_XDECREF(errorHandler);
4880 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004881 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004882 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004883 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004884 Py_XDECREF(errorHandler);
4885 Py_XDECREF(exc);
4886 Py_XDECREF(result);
4887 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004888
Tim Peters602f7402002-04-27 18:03:26 +00004889#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890}
4891
Alexander Belopolsky40018472011-02-26 01:02:56 +00004892PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004893PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4894 Py_ssize_t size,
4895 const char *errors)
4896{
4897 PyObject *v, *unicode;
4898
4899 unicode = PyUnicode_FromUnicode(s, size);
4900 if (unicode == NULL)
4901 return NULL;
4902 v = _PyUnicode_AsUTF8String(unicode, errors);
4903 Py_DECREF(unicode);
4904 return v;
4905}
4906
4907PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004908PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004909{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004910 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911}
4912
Walter Dörwald41980ca2007-08-16 21:55:45 +00004913/* --- UTF-32 Codec ------------------------------------------------------- */
4914
4915PyObject *
4916PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004917 Py_ssize_t size,
4918 const char *errors,
4919 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004920{
4921 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4922}
4923
4924PyObject *
4925PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004926 Py_ssize_t size,
4927 const char *errors,
4928 int *byteorder,
4929 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004930{
4931 const char *starts = s;
4932 Py_ssize_t startinpos;
4933 Py_ssize_t endinpos;
4934 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004935 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004936 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004937 int bo = 0; /* assume native ordering by default */
4938 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004939 /* Offsets from q for retrieving bytes in the right order. */
4940#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4941 int iorder[] = {0, 1, 2, 3};
4942#else
4943 int iorder[] = {3, 2, 1, 0};
4944#endif
4945 PyObject *errorHandler = NULL;
4946 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004947
Walter Dörwald41980ca2007-08-16 21:55:45 +00004948 q = (unsigned char *)s;
4949 e = q + size;
4950
4951 if (byteorder)
4952 bo = *byteorder;
4953
4954 /* Check for BOM marks (U+FEFF) in the input and adjust current
4955 byte order setting accordingly. In native mode, the leading BOM
4956 mark is skipped, in all other modes, it is copied to the output
4957 stream as-is (giving a ZWNBSP character). */
4958 if (bo == 0) {
4959 if (size >= 4) {
4960 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004961 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004962#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 if (bom == 0x0000FEFF) {
4964 q += 4;
4965 bo = -1;
4966 }
4967 else if (bom == 0xFFFE0000) {
4968 q += 4;
4969 bo = 1;
4970 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004971#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004972 if (bom == 0x0000FEFF) {
4973 q += 4;
4974 bo = 1;
4975 }
4976 else if (bom == 0xFFFE0000) {
4977 q += 4;
4978 bo = -1;
4979 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004980#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004981 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004982 }
4983
4984 if (bo == -1) {
4985 /* force LE */
4986 iorder[0] = 0;
4987 iorder[1] = 1;
4988 iorder[2] = 2;
4989 iorder[3] = 3;
4990 }
4991 else if (bo == 1) {
4992 /* force BE */
4993 iorder[0] = 3;
4994 iorder[1] = 2;
4995 iorder[2] = 1;
4996 iorder[3] = 0;
4997 }
4998
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004999 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005000 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005001 if (!unicode)
5002 return NULL;
5003 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005004 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005005 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005006
Walter Dörwald41980ca2007-08-16 21:55:45 +00005007 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005008 Py_UCS4 ch;
5009 /* remaining bytes at the end? (size should be divisible by 4) */
5010 if (e-q<4) {
5011 if (consumed)
5012 break;
5013 errmsg = "truncated data";
5014 startinpos = ((const char *)q)-starts;
5015 endinpos = ((const char *)e)-starts;
5016 goto utf32Error;
5017 /* The remaining input chars are ignored if the callback
5018 chooses to skip the input */
5019 }
5020 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5021 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005022
Benjamin Peterson29060642009-01-31 22:14:21 +00005023 if (ch >= 0x110000)
5024 {
5025 errmsg = "codepoint not in range(0x110000)";
5026 startinpos = ((const char *)q)-starts;
5027 endinpos = startinpos+4;
5028 goto utf32Error;
5029 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005030 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5031 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005032 q += 4;
5033 continue;
5034 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005035 if (unicode_decode_call_errorhandler(
5036 errors, &errorHandler,
5037 "utf32", errmsg,
5038 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005039 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005040 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005041 }
5042
5043 if (byteorder)
5044 *byteorder = bo;
5045
5046 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005048
5049 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005050 if (PyUnicode_Resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005051 goto onError;
5052
5053 Py_XDECREF(errorHandler);
5054 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005055 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005056
Benjamin Peterson29060642009-01-31 22:14:21 +00005057 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005058 Py_DECREF(unicode);
5059 Py_XDECREF(errorHandler);
5060 Py_XDECREF(exc);
5061 return NULL;
5062}
5063
5064PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005065_PyUnicode_EncodeUTF32(PyObject *str,
5066 const char *errors,
5067 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005068{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005069 int kind;
5070 void *data;
5071 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005072 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005073 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005074 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005075 /* Offsets from p for storing byte pairs in the right order. */
5076#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5077 int iorder[] = {0, 1, 2, 3};
5078#else
5079 int iorder[] = {3, 2, 1, 0};
5080#endif
5081
Benjamin Peterson29060642009-01-31 22:14:21 +00005082#define STORECHAR(CH) \
5083 do { \
5084 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5085 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5086 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5087 p[iorder[0]] = (CH) & 0xff; \
5088 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005089 } while(0)
5090
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005091 if (!PyUnicode_Check(str)) {
5092 PyErr_BadArgument();
5093 return NULL;
5094 }
5095 if (PyUnicode_READY(str) < 0)
5096 return NULL;
5097 kind = PyUnicode_KIND(str);
5098 data = PyUnicode_DATA(str);
5099 len = PyUnicode_GET_LENGTH(str);
5100
5101 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005102 bytesize = nsize * 4;
5103 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005104 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005105 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005106 if (v == NULL)
5107 return NULL;
5108
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005109 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005110 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005111 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005112 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005113 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005114
5115 if (byteorder == -1) {
5116 /* force LE */
5117 iorder[0] = 0;
5118 iorder[1] = 1;
5119 iorder[2] = 2;
5120 iorder[3] = 3;
5121 }
5122 else if (byteorder == 1) {
5123 /* force BE */
5124 iorder[0] = 3;
5125 iorder[1] = 2;
5126 iorder[2] = 1;
5127 iorder[3] = 0;
5128 }
5129
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005130 for (i = 0; i < len; i++)
5131 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005132
5133 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005134 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005135#undef STORECHAR
5136}
5137
Alexander Belopolsky40018472011-02-26 01:02:56 +00005138PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005139PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5140 Py_ssize_t size,
5141 const char *errors,
5142 int byteorder)
5143{
5144 PyObject *result;
5145 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5146 if (tmp == NULL)
5147 return NULL;
5148 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5149 Py_DECREF(tmp);
5150 return result;
5151}
5152
5153PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005154PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005155{
Victor Stinnerb960b342011-11-20 19:12:52 +01005156 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005157}
5158
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159/* --- UTF-16 Codec ------------------------------------------------------- */
5160
Tim Peters772747b2001-08-09 22:21:55 +00005161PyObject *
5162PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005163 Py_ssize_t size,
5164 const char *errors,
5165 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166{
Walter Dörwald69652032004-09-07 20:24:22 +00005167 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5168}
5169
Antoine Pitrouab868312009-01-10 15:40:25 +00005170/* Two masks for fast checking of whether a C 'long' may contain
5171 UTF16-encoded surrogate characters. This is an efficient heuristic,
5172 assuming that non-surrogate characters with a code point >= 0x8000 are
5173 rare in most input.
5174 FAST_CHAR_MASK is used when the input is in native byte ordering,
5175 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005176*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005177#if (SIZEOF_LONG == 8)
5178# define FAST_CHAR_MASK 0x8000800080008000L
5179# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5180#elif (SIZEOF_LONG == 4)
5181# define FAST_CHAR_MASK 0x80008000L
5182# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5183#else
5184# error C 'long' size should be either 4 or 8!
5185#endif
5186
Walter Dörwald69652032004-09-07 20:24:22 +00005187PyObject *
5188PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005189 Py_ssize_t size,
5190 const char *errors,
5191 int *byteorder,
5192 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005193{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005194 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005195 Py_ssize_t startinpos;
5196 Py_ssize_t endinpos;
5197 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005198 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005199 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005200 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005201 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005202 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005203 /* Offsets from q for retrieving byte pairs in the right order. */
5204#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5205 int ihi = 1, ilo = 0;
5206#else
5207 int ihi = 0, ilo = 1;
5208#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005209 PyObject *errorHandler = NULL;
5210 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211
5212 /* Note: size will always be longer than the resulting Unicode
5213 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005214 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215 if (!unicode)
5216 return NULL;
5217 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005218 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005219 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220
Tim Peters772747b2001-08-09 22:21:55 +00005221 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005222 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223
5224 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005225 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005227 /* Check for BOM marks (U+FEFF) in the input and adjust current
5228 byte order setting accordingly. In native mode, the leading BOM
5229 mark is skipped, in all other modes, it is copied to the output
5230 stream as-is (giving a ZWNBSP character). */
5231 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005232 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005233 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005234#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005235 if (bom == 0xFEFF) {
5236 q += 2;
5237 bo = -1;
5238 }
5239 else if (bom == 0xFFFE) {
5240 q += 2;
5241 bo = 1;
5242 }
Tim Petersced69f82003-09-16 20:30:58 +00005243#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005244 if (bom == 0xFEFF) {
5245 q += 2;
5246 bo = 1;
5247 }
5248 else if (bom == 0xFFFE) {
5249 q += 2;
5250 bo = -1;
5251 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005252#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005253 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005254 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255
Tim Peters772747b2001-08-09 22:21:55 +00005256 if (bo == -1) {
5257 /* force LE */
5258 ihi = 1;
5259 ilo = 0;
5260 }
5261 else if (bo == 1) {
5262 /* force BE */
5263 ihi = 0;
5264 ilo = 1;
5265 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005266#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5267 native_ordering = ilo < ihi;
5268#else
5269 native_ordering = ilo > ihi;
5270#endif
Tim Peters772747b2001-08-09 22:21:55 +00005271
Antoine Pitrouab868312009-01-10 15:40:25 +00005272 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005273 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005274 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005275 /* First check for possible aligned read of a C 'long'. Unaligned
5276 reads are more expensive, better to defer to another iteration. */
5277 if (!((size_t) q & LONG_PTR_MASK)) {
5278 /* Fast path for runs of non-surrogate chars. */
5279 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005280 int kind = PyUnicode_KIND(unicode);
5281 void *data = PyUnicode_DATA(unicode);
5282 while (_q < aligned_end) {
5283 unsigned long block = * (unsigned long *) _q;
5284 unsigned short *pblock = (unsigned short*)&block;
5285 Py_UCS4 maxch;
5286 if (native_ordering) {
5287 /* Can use buffer directly */
5288 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005289 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005290 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005291 else {
5292 /* Need to byte-swap */
5293 unsigned char *_p = (unsigned char*)pblock;
5294 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005295 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005296 _p[0] = _q[1];
5297 _p[1] = _q[0];
5298 _p[2] = _q[3];
5299 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005300#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005301 _p[4] = _q[5];
5302 _p[5] = _q[4];
5303 _p[6] = _q[7];
5304 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005305#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005306 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005307 maxch = Py_MAX(pblock[0], pblock[1]);
5308#if SIZEOF_LONG == 8
5309 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5310#endif
5311 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5312 if (unicode_widen(&unicode, maxch) < 0)
5313 goto onError;
5314 kind = PyUnicode_KIND(unicode);
5315 data = PyUnicode_DATA(unicode);
5316 }
5317 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5318 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5319#if SIZEOF_LONG == 8
5320 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5321 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5322#endif
5323 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005324 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005325 q = _q;
5326 if (q >= e)
5327 break;
5328 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005329 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005330
Benjamin Peterson14339b62009-01-31 16:36:08 +00005331 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005332
Victor Stinner551ac952011-11-29 22:58:13 +01005333 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005334 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5335 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005336 continue;
5337 }
5338
5339 /* UTF-16 code pair: */
5340 if (q > e) {
5341 errmsg = "unexpected end of data";
5342 startinpos = (((const char *)q) - 2) - starts;
5343 endinpos = ((const char *)e) + 1 - starts;
5344 goto utf16Error;
5345 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005346 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5347 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005348 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005349 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005350 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005351 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005352 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 continue;
5354 }
5355 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005356 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 startinpos = (((const char *)q)-4)-starts;
5358 endinpos = startinpos+2;
5359 goto utf16Error;
5360 }
5361
Benjamin Peterson14339b62009-01-31 16:36:08 +00005362 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005363 errmsg = "illegal encoding";
5364 startinpos = (((const char *)q)-2)-starts;
5365 endinpos = startinpos+2;
5366 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005367
Benjamin Peterson29060642009-01-31 22:14:21 +00005368 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005369 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005370 errors,
5371 &errorHandler,
5372 "utf16", errmsg,
5373 &starts,
5374 (const char **)&e,
5375 &startinpos,
5376 &endinpos,
5377 &exc,
5378 (const char **)&q,
5379 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005380 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005381 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005383 /* remaining byte at the end? (size should be even) */
5384 if (e == q) {
5385 if (!consumed) {
5386 errmsg = "truncated data";
5387 startinpos = ((const char *)q) - starts;
5388 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005389 if (unicode_decode_call_errorhandler(
5390 errors,
5391 &errorHandler,
5392 "utf16", errmsg,
5393 &starts,
5394 (const char **)&e,
5395 &startinpos,
5396 &endinpos,
5397 &exc,
5398 (const char **)&q,
5399 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005400 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005401 goto onError;
5402 /* The remaining input chars are ignored if the callback
5403 chooses to skip the input */
5404 }
5405 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406
5407 if (byteorder)
5408 *byteorder = bo;
5409
Walter Dörwald69652032004-09-07 20:24:22 +00005410 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005412
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005414 if (PyUnicode_Resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 goto onError;
5416
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005417 Py_XDECREF(errorHandler);
5418 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005419 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420
Benjamin Peterson29060642009-01-31 22:14:21 +00005421 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005423 Py_XDECREF(errorHandler);
5424 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425 return NULL;
5426}
5427
Antoine Pitrouab868312009-01-10 15:40:25 +00005428#undef FAST_CHAR_MASK
5429#undef SWAPPED_FAST_CHAR_MASK
5430
Tim Peters772747b2001-08-09 22:21:55 +00005431PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005432_PyUnicode_EncodeUTF16(PyObject *str,
5433 const char *errors,
5434 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005436 int kind;
5437 void *data;
5438 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005439 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005440 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005441 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005442 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005443 /* Offsets from p for storing byte pairs in the right order. */
5444#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5445 int ihi = 1, ilo = 0;
5446#else
5447 int ihi = 0, ilo = 1;
5448#endif
5449
Benjamin Peterson29060642009-01-31 22:14:21 +00005450#define STORECHAR(CH) \
5451 do { \
5452 p[ihi] = ((CH) >> 8) & 0xff; \
5453 p[ilo] = (CH) & 0xff; \
5454 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005455 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005457 if (!PyUnicode_Check(str)) {
5458 PyErr_BadArgument();
5459 return NULL;
5460 }
5461 if (PyUnicode_READY(str) < 0)
5462 return NULL;
5463 kind = PyUnicode_KIND(str);
5464 data = PyUnicode_DATA(str);
5465 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005466
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005467 pairs = 0;
5468 if (kind == PyUnicode_4BYTE_KIND)
5469 for (i = 0; i < len; i++)
5470 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5471 pairs++;
5472 /* 2 * (len + pairs + (byteorder == 0)) */
5473 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005474 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005475 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005476 bytesize = nsize * 2;
5477 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005478 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005479 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480 if (v == NULL)
5481 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005483 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005486 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005487 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005488
5489 if (byteorder == -1) {
5490 /* force LE */
5491 ihi = 1;
5492 ilo = 0;
5493 }
5494 else if (byteorder == 1) {
5495 /* force BE */
5496 ihi = 0;
5497 ilo = 1;
5498 }
5499
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005500 for (i = 0; i < len; i++) {
5501 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5502 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005503 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005504 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5505 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005506 }
Tim Peters772747b2001-08-09 22:21:55 +00005507 STORECHAR(ch);
5508 if (ch2)
5509 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005510 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005511
5512 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005513 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005514#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515}
5516
Alexander Belopolsky40018472011-02-26 01:02:56 +00005517PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005518PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5519 Py_ssize_t size,
5520 const char *errors,
5521 int byteorder)
5522{
5523 PyObject *result;
5524 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5525 if (tmp == NULL)
5526 return NULL;
5527 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5528 Py_DECREF(tmp);
5529 return result;
5530}
5531
5532PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005533PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005535 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536}
5537
5538/* --- Unicode Escape Codec ----------------------------------------------- */
5539
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.

   s    -- input bytes (need not be NUL-terminated; only `size` bytes are
           examined).
   size -- number of bytes at s; a negative value yields -1.

   Returns -1 if the input contains a non-ASCII byte, a trailing lone
   backslash, or an escape (\ooo, \x, \u, \U, \N) that is not guaranteed
   to produce an ASCII character.  Otherwise returns the length of the
   required buffer to hold the decoded string:
     - an ordinary character counts 1,
     - backslash + newline counts 0,
     - a recognised single-character escape (\\ \' \" \b \f \t \n \r \v \a)
       counts 1,
     - any other backslash + character counts 2 (both are kept).
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before forming p + size: pointer arithmetic with a
       negative offset would step outside the buffer. */
    if (size < 0)
        return -1;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5594
Fredrik Lundh06d12682001-01-24 07:59:11 +00005595static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005596
Alexander Belopolsky40018472011-02-26 01:02:56 +00005597PyObject *
5598PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005599 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005600 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005602 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005603 Py_ssize_t startinpos;
5604 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005605 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005606 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005608 char* message;
5609 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005610 PyObject *errorHandler = NULL;
5611 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005612 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005613 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005614
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005615 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005616
5617 /* After length_of_escaped_ascii_string() there are two alternatives,
5618 either the string is pure ASCII with named escapes like \n, etc.
5619 and we determined it's exact size (common case)
5620 or it contains \x, \u, ... escape sequences. then we create a
5621 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005622 if (len >= 0) {
5623 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005624 if (!v)
5625 goto onError;
5626 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005627 }
5628 else {
5629 /* Escaped strings will always be longer than the resulting
5630 Unicode string, so we start with size here and then reduce the
5631 length after conversion to the true value.
5632 (but if the error callback returns a long replacement string
5633 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005634 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005635 if (!v)
5636 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005637 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005638 }
5639
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005641 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005642 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005644
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 while (s < end) {
5646 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005647 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005648 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005650 /* The only case in which i == ascii_length is a backslash
5651 followed by a newline. */
5652 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005653
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654 /* Non-escape characters are interpreted as Unicode ordinals */
5655 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005656 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5657 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658 continue;
5659 }
5660
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005661 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 /* \ - Escapes */
5663 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005664 c = *s++;
5665 if (s > end)
5666 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005667
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005668 /* The only case in which i == ascii_length is a backslash
5669 followed by a newline. */
5670 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005671
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005672 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005675#define WRITECHAR(ch) \
5676 do { \
5677 if (unicode_putchar(&v, &i, ch) < 0) \
5678 goto onError; \
5679 }while(0)
5680
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005682 case '\\': WRITECHAR('\\'); break;
5683 case '\'': WRITECHAR('\''); break;
5684 case '\"': WRITECHAR('\"'); break;
5685 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005686 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005687 case 'f': WRITECHAR('\014'); break;
5688 case 't': WRITECHAR('\t'); break;
5689 case 'n': WRITECHAR('\n'); break;
5690 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005691 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005692 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005693 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005694 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695
Benjamin Peterson29060642009-01-31 22:14:21 +00005696 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 case '0': case '1': case '2': case '3':
5698 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005699 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005700 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005701 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005702 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005703 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005705 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 break;
5707
Benjamin Peterson29060642009-01-31 22:14:21 +00005708 /* hex escapes */
5709 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005711 digits = 2;
5712 message = "truncated \\xXX escape";
5713 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714
Benjamin Peterson29060642009-01-31 22:14:21 +00005715 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005717 digits = 4;
5718 message = "truncated \\uXXXX escape";
5719 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720
Benjamin Peterson29060642009-01-31 22:14:21 +00005721 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005722 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005723 digits = 8;
5724 message = "truncated \\UXXXXXXXX escape";
5725 hexescape:
5726 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005727 if (s+digits>end) {
5728 endinpos = size;
5729 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005730 errors, &errorHandler,
5731 "unicodeescape", "end of string in escape sequence",
5732 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005733 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005734 goto onError;
5735 goto nextByte;
5736 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005737 for (j = 0; j < digits; ++j) {
5738 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005739 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005740 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005741 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 errors, &errorHandler,
5743 "unicodeescape", message,
5744 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005745 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005746 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005747 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005748 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005749 }
5750 chr = (chr<<4) & ~0xF;
5751 if (c >= '0' && c <= '9')
5752 chr += c - '0';
5753 else if (c >= 'a' && c <= 'f')
5754 chr += 10 + c - 'a';
5755 else
5756 chr += 10 + c - 'A';
5757 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005758 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005759 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005760 /* _decoding_error will have already written into the
5761 target buffer. */
5762 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005763 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005764 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005765 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005766 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005767 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005768 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005769 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 errors, &errorHandler,
5771 "unicodeescape", "illegal Unicode character",
5772 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005773 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005774 goto onError;
5775 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005776 break;
5777
Benjamin Peterson29060642009-01-31 22:14:21 +00005778 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005779 case 'N':
5780 message = "malformed \\N character escape";
5781 if (ucnhash_CAPI == NULL) {
5782 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005783 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5784 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005785 if (ucnhash_CAPI == NULL)
5786 goto ucnhashError;
5787 }
5788 if (*s == '{') {
5789 const char *start = s+1;
5790 /* look for the closing brace */
5791 while (*s != '}' && s < end)
5792 s++;
5793 if (s > start && s < end && *s == '}') {
5794 /* found a name. look it up in the unicode database */
5795 message = "unknown Unicode character name";
5796 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005797 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005798 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005799 goto store;
5800 }
5801 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005802 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005803 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005804 errors, &errorHandler,
5805 "unicodeescape", message,
5806 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005807 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005808 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005809 break;
5810
5811 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005812 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005813 message = "\\ at end of string";
5814 s--;
5815 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005816 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005817 errors, &errorHandler,
5818 "unicodeescape", message,
5819 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005820 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005821 goto onError;
5822 }
5823 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005824 WRITECHAR('\\');
5825 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005826 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005827 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005829 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005830 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005832#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005833
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005834 if (PyUnicode_Resize(&v, i) < 0)
5835 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005836 Py_XDECREF(errorHandler);
5837 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005838 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005839
Benjamin Peterson29060642009-01-31 22:14:21 +00005840 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005841 PyErr_SetString(
5842 PyExc_UnicodeError,
5843 "\\N escapes not supported (can't load unicodedata module)"
5844 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005845 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005846 Py_XDECREF(errorHandler);
5847 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005848 return NULL;
5849
Benjamin Peterson29060642009-01-31 22:14:21 +00005850 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005852 Py_XDECREF(errorHandler);
5853 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854 return NULL;
5855}
5856
/* Return a Unicode-Escape encoded bytes version of the Unicode object.

   The result is not enclosed in quotes, and quote characters in the
   input are not escaped.

*/
5863
Alexander Belopolsky40018472011-02-26 01:02:56 +00005864PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005865PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005867 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005868 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005870 int kind;
5871 void *data;
5872 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873
Thomas Wouters89f507f2006-12-13 04:49:30 +00005874 /* Initial allocation is based on the longest-possible unichr
5875 escape.
5876
5877 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5878 unichr, so in this case it's the longest unichr escape. In
5879 narrow (UTF-16) builds this is five chars per source unichr
5880 since there are two unichrs in the surrogate pair, so in narrow
5881 (UTF-16) builds it's not the longest unichr escape.
5882
5883 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5884 so in the narrow (UTF-16) build case it's the longest unichr
5885 escape.
5886 */
5887
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005888 if (!PyUnicode_Check(unicode)) {
5889 PyErr_BadArgument();
5890 return NULL;
5891 }
5892 if (PyUnicode_READY(unicode) < 0)
5893 return NULL;
5894 len = PyUnicode_GET_LENGTH(unicode);
5895 kind = PyUnicode_KIND(unicode);
5896 data = PyUnicode_DATA(unicode);
5897 switch(kind) {
5898 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5899 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5900 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5901 }
5902
5903 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005904 return PyBytes_FromStringAndSize(NULL, 0);
5905
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005906 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005907 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005908
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005909 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005910 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005911 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005912 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 if (repr == NULL)
5914 return NULL;
5915
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005916 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005918 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005919 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005920
Walter Dörwald79e913e2007-05-12 11:08:06 +00005921 /* Escape backslashes */
5922 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923 *p++ = '\\';
5924 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005925 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005926 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005927
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005928 /* Map 21-bit characters to '\U00xxxxxx' */
5929 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005930 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005931 *p++ = '\\';
5932 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005933 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5934 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5935 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5936 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5937 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5938 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5939 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5940 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005941 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005942 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005943
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005945 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 *p++ = '\\';
5947 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005948 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5949 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5950 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5951 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005953
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005954 /* Map special whitespace to '\t', \n', '\r' */
5955 else if (ch == '\t') {
5956 *p++ = '\\';
5957 *p++ = 't';
5958 }
5959 else if (ch == '\n') {
5960 *p++ = '\\';
5961 *p++ = 'n';
5962 }
5963 else if (ch == '\r') {
5964 *p++ = '\\';
5965 *p++ = 'r';
5966 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005967
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005968 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005969 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005971 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005972 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5973 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005974 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005975
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 /* Copy everything else as-is */
5977 else
5978 *p++ = (char) ch;
5979 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005981 assert(p - PyBytes_AS_STRING(repr) > 0);
5982 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5983 return NULL;
5984 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985}
5986
Alexander Belopolsky40018472011-02-26 01:02:56 +00005987PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005988PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5989 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005991 PyObject *result;
5992 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5993 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005995 result = PyUnicode_AsUnicodeEscapeString(tmp);
5996 Py_DECREF(tmp);
5997 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998}
5999
6000/* --- Raw Unicode Escape Codec ------------------------------------------- */
6001
Alexander Belopolsky40018472011-02-26 01:02:56 +00006002PyObject *
6003PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006004 Py_ssize_t size,
6005 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006007 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006008 Py_ssize_t startinpos;
6009 Py_ssize_t endinpos;
6010 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006011 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 const char *end;
6013 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006014 PyObject *errorHandler = NULL;
6015 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006016
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 /* Escaped strings will always be longer than the resulting
6018 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006019 length after conversion to the true value. (But decoding error
6020 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006021 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006023 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006025 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006026 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 end = s + size;
6028 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006029 unsigned char c;
6030 Py_UCS4 x;
6031 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006032 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033
Benjamin Peterson29060642009-01-31 22:14:21 +00006034 /* Non-escape characters are interpreted as Unicode ordinals */
6035 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006036 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6037 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006038 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006039 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 startinpos = s-starts;
6041
6042 /* \u-escapes are only interpreted iff the number of leading
6043 backslashes if odd */
6044 bs = s;
6045 for (;s < end;) {
6046 if (*s != '\\')
6047 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006048 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6049 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 }
6051 if (((s - bs) & 1) == 0 ||
6052 s >= end ||
6053 (*s != 'u' && *s != 'U')) {
6054 continue;
6055 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006056 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 count = *s=='u' ? 4 : 8;
6058 s++;
6059
6060 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006061 for (x = 0, i = 0; i < count; ++i, ++s) {
6062 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006063 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 endinpos = s-starts;
6065 if (unicode_decode_call_errorhandler(
6066 errors, &errorHandler,
6067 "rawunicodeescape", "truncated \\uXXXX",
6068 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006069 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006070 goto onError;
6071 goto nextByte;
6072 }
6073 x = (x<<4) & ~0xF;
6074 if (c >= '0' && c <= '9')
6075 x += c - '0';
6076 else if (c >= 'a' && c <= 'f')
6077 x += 10 + c - 'a';
6078 else
6079 x += 10 + c - 'A';
6080 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006081 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006082 if (unicode_putchar(&v, &outpos, x) < 0)
6083 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006084 } else {
6085 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006086 if (unicode_decode_call_errorhandler(
6087 errors, &errorHandler,
6088 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006089 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006090 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006091 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006092 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 nextByte:
6094 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006096 if (PyUnicode_Resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006097 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006098 Py_XDECREF(errorHandler);
6099 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006100 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006101
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006104 Py_XDECREF(errorHandler);
6105 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106 return NULL;
6107}
6108
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006109
Alexander Belopolsky40018472011-02-26 01:02:56 +00006110PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006111PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006113 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 char *p;
6115 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006116 Py_ssize_t expandsize, pos;
6117 int kind;
6118 void *data;
6119 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006121 if (!PyUnicode_Check(unicode)) {
6122 PyErr_BadArgument();
6123 return NULL;
6124 }
6125 if (PyUnicode_READY(unicode) < 0)
6126 return NULL;
6127 kind = PyUnicode_KIND(unicode);
6128 data = PyUnicode_DATA(unicode);
6129 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006130 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6131 bytes, and 1 byte characters 4. */
6132 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006133
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006134 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006136
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006137 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 if (repr == NULL)
6139 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006140 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006141 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006143 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006144 for (pos = 0; pos < len; pos++) {
6145 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006146 /* Map 32-bit characters to '\Uxxxxxxxx' */
6147 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006148 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006149 *p++ = '\\';
6150 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006151 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6152 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6153 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6154 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6155 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6156 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6157 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6158 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006159 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006160 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006161 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 *p++ = '\\';
6163 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006164 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6165 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6166 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6167 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006169 /* Copy everything else as-is */
6170 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 *p++ = (char) ch;
6172 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006173
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006174 assert(p > q);
6175 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006176 return NULL;
6177 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178}
6179
Alexander Belopolsky40018472011-02-26 01:02:56 +00006180PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006181PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6182 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006184 PyObject *result;
6185 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6186 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006187 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006188 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6189 Py_DECREF(tmp);
6190 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191}
6192
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006193/* --- Unicode Internal Codec ------------------------------------------- */
6194
Alexander Belopolsky40018472011-02-26 01:02:56 +00006195PyObject *
6196_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006197 Py_ssize_t size,
6198 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006199{
6200 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006201 Py_ssize_t startinpos;
6202 Py_ssize_t endinpos;
6203 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006204 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006205 const char *end;
6206 const char *reason;
6207 PyObject *errorHandler = NULL;
6208 PyObject *exc = NULL;
6209
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006210 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006211 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006212 1))
6213 return NULL;
6214
Thomas Wouters89f507f2006-12-13 04:49:30 +00006215 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006216 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006217 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006218 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006219 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006220 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006221 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006222 end = s + size;
6223
6224 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006225 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006226 Py_UCS4 ch;
6227 /* We copy the raw representation one byte at a time because the
6228 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006229 ((char *) &uch)[0] = s[0];
6230 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006231#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006232 ((char *) &uch)[2] = s[2];
6233 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006234#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006235 ch = uch;
6236
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006237 /* We have to sanity check the raw data, otherwise doom looms for
6238 some malformed UCS-4 data. */
6239 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006240#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006241 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006242#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006243 end-s < Py_UNICODE_SIZE
6244 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006246 startinpos = s - starts;
6247 if (end-s < Py_UNICODE_SIZE) {
6248 endinpos = end-starts;
6249 reason = "truncated input";
6250 }
6251 else {
6252 endinpos = s - starts + Py_UNICODE_SIZE;
6253 reason = "illegal code point (> 0x10FFFF)";
6254 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006255 if (unicode_decode_call_errorhandler(
6256 errors, &errorHandler,
6257 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006258 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006259 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006260 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006261 continue;
6262 }
6263
6264 s += Py_UNICODE_SIZE;
6265#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006266 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006267 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006268 Py_UNICODE uch2;
6269 ((char *) &uch2)[0] = s[0];
6270 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006271 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006272 {
Victor Stinner551ac952011-11-29 22:58:13 +01006273 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006274 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006275 }
6276 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006277#endif
6278
6279 if (unicode_putchar(&v, &outpos, ch) < 0)
6280 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006281 }
6282
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006283 if (PyUnicode_Resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006284 goto onError;
6285 Py_XDECREF(errorHandler);
6286 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006287 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006288
Benjamin Peterson29060642009-01-31 22:14:21 +00006289 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006290 Py_XDECREF(v);
6291 Py_XDECREF(errorHandler);
6292 Py_XDECREF(exc);
6293 return NULL;
6294}
6295
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296/* --- Latin-1 Codec ------------------------------------------------------ */
6297
Alexander Belopolsky40018472011-02-26 01:02:56 +00006298PyObject *
6299PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006300 Py_ssize_t size,
6301 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006304 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305}
6306
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006307/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006308static void
6309make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006310 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006311 PyObject *unicode,
6312 Py_ssize_t startpos, Py_ssize_t endpos,
6313 const char *reason)
6314{
6315 if (*exceptionObject == NULL) {
6316 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006317 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006318 encoding, unicode, startpos, endpos, reason);
6319 }
6320 else {
6321 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6322 goto onError;
6323 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6324 goto onError;
6325 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6326 goto onError;
6327 return;
6328 onError:
6329 Py_DECREF(*exceptionObject);
6330 *exceptionObject = NULL;
6331 }
6332}
6333
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006334/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006335static void
6336raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006337 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006338 PyObject *unicode,
6339 Py_ssize_t startpos, Py_ssize_t endpos,
6340 const char *reason)
6341{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006342 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006343 encoding, unicode, startpos, endpos, reason);
6344 if (*exceptionObject != NULL)
6345 PyCodec_StrictErrors(*exceptionObject);
6346}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006347
6348/* error handling callback helper:
6349 build arguments, call the callback and check the arguments,
6350 put the result into newpos and return the replacement string, which
6351 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006352static PyObject *
6353unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006354 PyObject **errorHandler,
6355 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006356 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006357 Py_ssize_t startpos, Py_ssize_t endpos,
6358 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006359{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006360 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006361 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006362 PyObject *restuple;
6363 PyObject *resunicode;
6364
6365 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006366 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006367 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006368 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006369 }
6370
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006371 if (PyUnicode_READY(unicode) < 0)
6372 return NULL;
6373 len = PyUnicode_GET_LENGTH(unicode);
6374
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006375 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006376 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006377 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006379
6380 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006381 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006382 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006384 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006385 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 Py_DECREF(restuple);
6387 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006388 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006389 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006390 &resunicode, newpos)) {
6391 Py_DECREF(restuple);
6392 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006393 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006394 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6395 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6396 Py_DECREF(restuple);
6397 return NULL;
6398 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006400 *newpos = len + *newpos;
6401 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006402 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6403 Py_DECREF(restuple);
6404 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006405 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406 Py_INCREF(resunicode);
6407 Py_DECREF(restuple);
6408 return resunicode;
6409}
6410
Alexander Belopolsky40018472011-02-26 01:02:56 +00006411static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006412unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006413 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006414 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006415{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006416 /* input state */
6417 Py_ssize_t pos=0, size;
6418 int kind;
6419 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006420 /* output object */
6421 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006422 /* pointer into the output */
6423 char *str;
6424 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006425 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006426 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6427 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006428 PyObject *errorHandler = NULL;
6429 PyObject *exc = NULL;
6430 /* the following variable is used for caching string comparisons
6431 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6432 int known_errorHandler = -1;
6433
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006434 if (PyUnicode_READY(unicode) < 0)
6435 return NULL;
6436 size = PyUnicode_GET_LENGTH(unicode);
6437 kind = PyUnicode_KIND(unicode);
6438 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006439 /* allocate enough for a simple encoding without
6440 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006441 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006442 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006443 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006444 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006445 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006446 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006447 ressize = size;
6448
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006449 while (pos < size) {
6450 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006451
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 /* can we encode this? */
6453 if (c<limit) {
6454 /* no overflow check, because we know that the space is enough */
6455 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006456 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006457 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006458 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006459 Py_ssize_t requiredsize;
6460 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006461 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006462 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006463 Py_ssize_t collstart = pos;
6464 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006465 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006466 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 ++collend;
6468 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6469 if (known_errorHandler==-1) {
6470 if ((errors==NULL) || (!strcmp(errors, "strict")))
6471 known_errorHandler = 1;
6472 else if (!strcmp(errors, "replace"))
6473 known_errorHandler = 2;
6474 else if (!strcmp(errors, "ignore"))
6475 known_errorHandler = 3;
6476 else if (!strcmp(errors, "xmlcharrefreplace"))
6477 known_errorHandler = 4;
6478 else
6479 known_errorHandler = 0;
6480 }
6481 switch (known_errorHandler) {
6482 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006483 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006484 goto onError;
6485 case 2: /* replace */
6486 while (collstart++<collend)
6487 *str++ = '?'; /* fall through */
6488 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006489 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006490 break;
6491 case 4: /* xmlcharrefreplace */
6492 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006493 /* determine replacement size */
6494 for (i = collstart, repsize = 0; i < collend; ++i) {
6495 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6496 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006498 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006500 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006501 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006502 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006504 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006505 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006506 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006508 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006509 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006510 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006511 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006513 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 if (requiredsize > ressize) {
6515 if (requiredsize<2*ressize)
6516 requiredsize = 2*ressize;
6517 if (_PyBytes_Resize(&res, requiredsize))
6518 goto onError;
6519 str = PyBytes_AS_STRING(res) + respos;
6520 ressize = requiredsize;
6521 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006522 /* generate replacement */
6523 for (i = collstart; i < collend; ++i) {
6524 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006526 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 break;
6528 default:
6529 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006530 encoding, reason, unicode, &exc,
6531 collstart, collend, &newpos);
6532 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6533 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006535 if (PyBytes_Check(repunicode)) {
6536 /* Directly copy bytes result to output. */
6537 repsize = PyBytes_Size(repunicode);
6538 if (repsize > 1) {
6539 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006540 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006541 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6542 Py_DECREF(repunicode);
6543 goto onError;
6544 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006545 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006546 ressize += repsize-1;
6547 }
6548 memcpy(str, PyBytes_AsString(repunicode), repsize);
6549 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006550 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006551 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006552 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006553 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006554 /* need more space? (at least enough for what we
6555 have+the replacement+the rest of the string, so
6556 we won't have to check space for encodable characters) */
6557 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006558 repsize = PyUnicode_GET_LENGTH(repunicode);
6559 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006560 if (requiredsize > ressize) {
6561 if (requiredsize<2*ressize)
6562 requiredsize = 2*ressize;
6563 if (_PyBytes_Resize(&res, requiredsize)) {
6564 Py_DECREF(repunicode);
6565 goto onError;
6566 }
6567 str = PyBytes_AS_STRING(res) + respos;
6568 ressize = requiredsize;
6569 }
6570 /* check if there is anything unencodable in the replacement
6571 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006572 for (i = 0; repsize-->0; ++i, ++str) {
6573 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006575 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006576 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 Py_DECREF(repunicode);
6578 goto onError;
6579 }
6580 *str = (char)c;
6581 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006582 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006583 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006584 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006585 }
6586 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006587 /* Resize if we allocated to much */
6588 size = str - PyBytes_AS_STRING(res);
6589 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006590 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006591 if (_PyBytes_Resize(&res, size) < 0)
6592 goto onError;
6593 }
6594
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006595 Py_XDECREF(errorHandler);
6596 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006597 return res;
6598
6599 onError:
6600 Py_XDECREF(res);
6601 Py_XDECREF(errorHandler);
6602 Py_XDECREF(exc);
6603 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006604}
6605
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006606/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006607PyObject *
6608PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006609 Py_ssize_t size,
6610 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006612 PyObject *result;
6613 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6614 if (unicode == NULL)
6615 return NULL;
6616 result = unicode_encode_ucs1(unicode, errors, 256);
6617 Py_DECREF(unicode);
6618 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619}
6620
Alexander Belopolsky40018472011-02-26 01:02:56 +00006621PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006622_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623{
6624 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 PyErr_BadArgument();
6626 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006628 if (PyUnicode_READY(unicode) == -1)
6629 return NULL;
6630 /* Fast path: if it is a one-byte string, construct
6631 bytes object directly. */
6632 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6633 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6634 PyUnicode_GET_LENGTH(unicode));
6635 /* Non-Latin-1 characters present. Defer to above function to
6636 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006637 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006638}
6639
6640PyObject*
6641PyUnicode_AsLatin1String(PyObject *unicode)
6642{
6643 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644}
6645
6646/* --- 7-bit ASCII Codec -------------------------------------------------- */
6647
Alexander Belopolsky40018472011-02-26 01:02:56 +00006648PyObject *
6649PyUnicode_DecodeASCII(const char *s,
6650 Py_ssize_t size,
6651 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006653 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006654 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006655 int kind;
6656 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006657 Py_ssize_t startinpos;
6658 Py_ssize_t endinpos;
6659 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006660 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006661 int has_error;
6662 const unsigned char *p = (const unsigned char *)s;
6663 const unsigned char *end = p + size;
6664 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665 PyObject *errorHandler = NULL;
6666 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006667
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006668 if (size == 0) {
6669 Py_INCREF(unicode_empty);
6670 return unicode_empty;
6671 }
6672
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006674 if (size == 1 && (unsigned char)s[0] < 128)
6675 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006676
Victor Stinner702c7342011-10-05 13:50:52 +02006677 has_error = 0;
6678 while (p < end && !has_error) {
6679 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6680 an explanation. */
6681 if (!((size_t) p & LONG_PTR_MASK)) {
6682 /* Help register allocation */
6683 register const unsigned char *_p = p;
6684 while (_p < aligned_end) {
6685 unsigned long value = *(unsigned long *) _p;
6686 if (value & ASCII_CHAR_MASK) {
6687 has_error = 1;
6688 break;
6689 }
6690 _p += SIZEOF_LONG;
6691 }
6692 if (_p == end)
6693 break;
6694 if (has_error)
6695 break;
6696 p = _p;
6697 }
6698 if (*p & 0x80) {
6699 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006700 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006701 }
6702 else {
6703 ++p;
6704 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006705 }
Victor Stinner702c7342011-10-05 13:50:52 +02006706 if (!has_error)
6707 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006708
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006709 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006711 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006713 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006714 kind = PyUnicode_KIND(v);
6715 data = PyUnicode_DATA(v);
6716 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006717 e = s + size;
6718 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 register unsigned char c = (unsigned char)*s;
6720 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006721 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006722 ++s;
6723 }
6724 else {
6725 startinpos = s-starts;
6726 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 if (unicode_decode_call_errorhandler(
6728 errors, &errorHandler,
6729 "ascii", "ordinal not in range(128)",
6730 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006731 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006732 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006733 kind = PyUnicode_KIND(v);
6734 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006737 if (PyUnicode_Resize(&v, outpos) < 0)
6738 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006739 Py_XDECREF(errorHandler);
6740 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006741 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006742 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006743
Benjamin Peterson29060642009-01-31 22:14:21 +00006744 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006746 Py_XDECREF(errorHandler);
6747 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748 return NULL;
6749}
6750
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006751/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006752PyObject *
6753PyUnicode_EncodeASCII(const Py_UNICODE *p,
6754 Py_ssize_t size,
6755 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006757 PyObject *result;
6758 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6759 if (unicode == NULL)
6760 return NULL;
6761 result = unicode_encode_ucs1(unicode, errors, 128);
6762 Py_DECREF(unicode);
6763 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764}
6765
Alexander Belopolsky40018472011-02-26 01:02:56 +00006766PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006767_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768{
6769 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 PyErr_BadArgument();
6771 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006773 if (PyUnicode_READY(unicode) == -1)
6774 return NULL;
6775 /* Fast path: if it is an ASCII-only string, construct bytes object
6776 directly. Else defer to above function to raise the exception. */
6777 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6778 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6779 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006780 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006781}
6782
6783PyObject *
6784PyUnicode_AsASCIIString(PyObject *unicode)
6785{
6786 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787}
6788
Victor Stinner99b95382011-07-04 14:23:54 +02006789#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006790
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006791/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006792
/* The code page decoder passes chunk lengths to its helpers as int.  When
   int is narrower than size_t, a single chunk may not cover the whole
   input, so the INT_MAX chunking path guarded by NEED_RETRY is compiled
   in. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* Fallback definition for headers that do not provide
   WC_ERR_INVALID_CHARS (presumably older Windows SDKs -- TODO confirm). */
#ifndef WC_ERR_INVALID_CHARS
# define WC_ERR_INVALID_CHARS 0x0080
#endif
6800
6801static char*
6802code_page_name(UINT code_page, PyObject **obj)
6803{
6804 *obj = NULL;
6805 if (code_page == CP_ACP)
6806 return "mbcs";
6807 if (code_page == CP_UTF7)
6808 return "CP_UTF7";
6809 if (code_page == CP_UTF8)
6810 return "CP_UTF8";
6811
6812 *obj = PyBytes_FromFormat("cp%u", code_page);
6813 if (*obj == NULL)
6814 return NULL;
6815 return PyBytes_AS_STRING(*obj);
6816}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006817
Alexander Belopolsky40018472011-02-26 01:02:56 +00006818static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006819is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006820{
6821 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006822 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006823
Victor Stinner3a50e702011-10-18 21:21:00 +02006824 if (!IsDBCSLeadByteEx(code_page, *curr))
6825 return 0;
6826
6827 prev = CharPrevExA(code_page, s, curr, 0);
6828 if (prev == curr)
6829 return 1;
6830 /* FIXME: This code is limited to "true" double-byte encodings,
6831 as it assumes an incomplete character consists of a single
6832 byte. */
6833 if (curr - prev == 2)
6834 return 1;
6835 if (!IsDBCSLeadByteEx(code_page, *prev))
6836 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006837 return 0;
6838}
6839
Victor Stinner3a50e702011-10-18 21:21:00 +02006840static DWORD
6841decode_code_page_flags(UINT code_page)
6842{
6843 if (code_page == CP_UTF7) {
6844 /* The CP_UTF7 decoder only supports flags=0 */
6845 return 0;
6846 }
6847 else
6848 return MB_ERR_INVALID_CHARS;
6849}
6850
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006851/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006852 * Decode a byte string from a Windows code page into unicode object in strict
6853 * mode.
6854 *
6855 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6856 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006857 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006858static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006859decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006860 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006861 const char *in,
6862 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006863{
Victor Stinner3a50e702011-10-18 21:21:00 +02006864 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006865 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006866 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006867
6868 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006869 assert(insize > 0);
6870 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6871 if (outsize <= 0)
6872 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006873
6874 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006875 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006876 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 if (*v == NULL)
6878 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006879 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006880 }
6881 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006882 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006883 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006884 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006885 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006886 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006887 }
6888
6889 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006890 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6891 if (outsize <= 0)
6892 goto error;
6893 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006894
Victor Stinner3a50e702011-10-18 21:21:00 +02006895error:
6896 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6897 return -2;
6898 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006899 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006900}
6901
Victor Stinner3a50e702011-10-18 21:21:00 +02006902/*
6903 * Decode a byte string from a code page into unicode object with an error
6904 * handler.
6905 *
6906 * Returns consumed size if succeed, or raise a WindowsError or
6907 * UnicodeDecodeError exception and returns -1 on error.
6908 */
6909static int
6910decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006911 PyObject **v,
6912 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006913 const char *errors)
6914{
6915 const char *startin = in;
6916 const char *endin = in + size;
6917 const DWORD flags = decode_code_page_flags(code_page);
6918 /* Ideally, we should get reason from FormatMessage. This is the Windows
6919 2000 English version of the message. */
6920 const char *reason = "No mapping for the Unicode character exists "
6921 "in the target code page.";
6922 /* each step cannot decode more than 1 character, but a character can be
6923 represented as a surrogate pair */
6924 wchar_t buffer[2], *startout, *out;
6925 int insize, outsize;
6926 PyObject *errorHandler = NULL;
6927 PyObject *exc = NULL;
6928 PyObject *encoding_obj = NULL;
6929 char *encoding;
6930 DWORD err;
6931 int ret = -1;
6932
6933 assert(size > 0);
6934
6935 encoding = code_page_name(code_page, &encoding_obj);
6936 if (encoding == NULL)
6937 return -1;
6938
6939 if (errors == NULL || strcmp(errors, "strict") == 0) {
6940 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6941 UnicodeDecodeError. */
6942 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6943 if (exc != NULL) {
6944 PyCodec_StrictErrors(exc);
6945 Py_CLEAR(exc);
6946 }
6947 goto error;
6948 }
6949
6950 if (*v == NULL) {
6951 /* Create unicode object */
6952 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6953 PyErr_NoMemory();
6954 goto error;
6955 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006956 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006957 if (*v == NULL)
6958 goto error;
6959 startout = PyUnicode_AS_UNICODE(*v);
6960 }
6961 else {
6962 /* Extend unicode object */
6963 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6964 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6965 PyErr_NoMemory();
6966 goto error;
6967 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006968 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006969 goto error;
6970 startout = PyUnicode_AS_UNICODE(*v) + n;
6971 }
6972
6973 /* Decode the byte string character per character */
6974 out = startout;
6975 while (in < endin)
6976 {
6977 /* Decode a character */
6978 insize = 1;
6979 do
6980 {
6981 outsize = MultiByteToWideChar(code_page, flags,
6982 in, insize,
6983 buffer, Py_ARRAY_LENGTH(buffer));
6984 if (outsize > 0)
6985 break;
6986 err = GetLastError();
6987 if (err != ERROR_NO_UNICODE_TRANSLATION
6988 && err != ERROR_INSUFFICIENT_BUFFER)
6989 {
6990 PyErr_SetFromWindowsErr(0);
6991 goto error;
6992 }
6993 insize++;
6994 }
6995 /* 4=maximum length of a UTF-8 sequence */
6996 while (insize <= 4 && (in + insize) <= endin);
6997
6998 if (outsize <= 0) {
6999 Py_ssize_t startinpos, endinpos, outpos;
7000
7001 startinpos = in - startin;
7002 endinpos = startinpos + 1;
7003 outpos = out - PyUnicode_AS_UNICODE(*v);
7004 if (unicode_decode_call_errorhandler(
7005 errors, &errorHandler,
7006 encoding, reason,
7007 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007008 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007009 {
7010 goto error;
7011 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007012 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007013 }
7014 else {
7015 in += insize;
7016 memcpy(out, buffer, outsize * sizeof(wchar_t));
7017 out += outsize;
7018 }
7019 }
7020
7021 /* write a NUL character at the end */
7022 *out = 0;
7023
7024 /* Extend unicode object */
7025 outsize = out - startout;
7026 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007027 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007028 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007029 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007030
7031error:
7032 Py_XDECREF(encoding_obj);
7033 Py_XDECREF(errorHandler);
7034 Py_XDECREF(exc);
7035 return ret;
7036}
7037
Victor Stinner3a50e702011-10-18 21:21:00 +02007038static PyObject *
7039decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007040 const char *s, Py_ssize_t size,
7041 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007042{
Victor Stinner76a31a62011-11-04 00:05:13 +01007043 PyObject *v = NULL;
7044 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007045
Victor Stinner3a50e702011-10-18 21:21:00 +02007046 if (code_page < 0) {
7047 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7048 return NULL;
7049 }
7050
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007051 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007052 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007053
Victor Stinner76a31a62011-11-04 00:05:13 +01007054 do
7055 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007056#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007057 if (size > INT_MAX) {
7058 chunk_size = INT_MAX;
7059 final = 0;
7060 done = 0;
7061 }
7062 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007063#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007064 {
7065 chunk_size = (int)size;
7066 final = (consumed == NULL);
7067 done = 1;
7068 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007069
Victor Stinner76a31a62011-11-04 00:05:13 +01007070 /* Skip trailing lead-byte unless 'final' is set */
7071 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7072 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007073
Victor Stinner76a31a62011-11-04 00:05:13 +01007074 if (chunk_size == 0 && done) {
7075 if (v != NULL)
7076 break;
7077 Py_INCREF(unicode_empty);
7078 return unicode_empty;
7079 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007080
Victor Stinner76a31a62011-11-04 00:05:13 +01007081
7082 converted = decode_code_page_strict(code_page, &v,
7083 s, chunk_size);
7084 if (converted == -2)
7085 converted = decode_code_page_errors(code_page, &v,
7086 s, chunk_size,
7087 errors);
7088 assert(converted != 0);
7089
7090 if (converted < 0) {
7091 Py_XDECREF(v);
7092 return NULL;
7093 }
7094
7095 if (consumed)
7096 *consumed += converted;
7097
7098 s += converted;
7099 size -= converted;
7100 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007101
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007102 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007103}
7104
Alexander Belopolsky40018472011-02-26 01:02:56 +00007105PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007106PyUnicode_DecodeCodePageStateful(int code_page,
7107 const char *s,
7108 Py_ssize_t size,
7109 const char *errors,
7110 Py_ssize_t *consumed)
7111{
7112 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7113}
7114
7115PyObject *
7116PyUnicode_DecodeMBCSStateful(const char *s,
7117 Py_ssize_t size,
7118 const char *errors,
7119 Py_ssize_t *consumed)
7120{
7121 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7122}
7123
7124PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007125PyUnicode_DecodeMBCS(const char *s,
7126 Py_ssize_t size,
7127 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007128{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007129 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7130}
7131
Victor Stinner3a50e702011-10-18 21:21:00 +02007132static DWORD
7133encode_code_page_flags(UINT code_page, const char *errors)
7134{
7135 if (code_page == CP_UTF8) {
7136 if (winver.dwMajorVersion >= 6)
7137 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7138 and later */
7139 return WC_ERR_INVALID_CHARS;
7140 else
7141 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7142 return 0;
7143 }
7144 else if (code_page == CP_UTF7) {
7145 /* CP_UTF7 only supports flags=0 */
7146 return 0;
7147 }
7148 else {
7149 if (errors != NULL && strcmp(errors, "replace") == 0)
7150 return 0;
7151 else
7152 return WC_NO_BEST_FIT_CHARS;
7153 }
7154}
7155
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007156/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007157 * Encode a Unicode string to a Windows code page into a byte string in strict
7158 * mode.
7159 *
7160 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7161 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007162 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007163static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007164encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007165 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007166 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007167{
Victor Stinner554f3f02010-06-16 23:33:54 +00007168 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007169 BOOL *pusedDefaultChar = &usedDefaultChar;
7170 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007171 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007172 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007173 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007174 const DWORD flags = encode_code_page_flags(code_page, NULL);
7175 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007176 /* Create a substring so that we can get the UTF-16 representation
7177 of just the slice under consideration. */
7178 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007179
Martin v. Löwis3d325192011-11-04 18:23:06 +01007180 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007181
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007183 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007184 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007185 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007186
Victor Stinner2fc507f2011-11-04 20:06:39 +01007187 substring = PyUnicode_Substring(unicode, offset, offset+len);
7188 if (substring == NULL)
7189 return -1;
7190 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7191 if (p == NULL) {
7192 Py_DECREF(substring);
7193 return -1;
7194 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007195
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007196 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007197 outsize = WideCharToMultiByte(code_page, flags,
7198 p, size,
7199 NULL, 0,
7200 NULL, pusedDefaultChar);
7201 if (outsize <= 0)
7202 goto error;
7203 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007204 if (pusedDefaultChar && *pusedDefaultChar) {
7205 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007206 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007207 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007208
Victor Stinner3a50e702011-10-18 21:21:00 +02007209 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007210 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007211 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007212 if (*outbytes == NULL) {
7213 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007214 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007215 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007217 }
7218 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007219 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007220 const Py_ssize_t n = PyBytes_Size(*outbytes);
7221 if (outsize > PY_SSIZE_T_MAX - n) {
7222 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007223 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007224 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007225 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007226 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7227 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007228 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007229 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007230 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007231 }
7232
7233 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007234 outsize = WideCharToMultiByte(code_page, flags,
7235 p, size,
7236 out, outsize,
7237 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007238 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007239 if (outsize <= 0)
7240 goto error;
7241 if (pusedDefaultChar && *pusedDefaultChar)
7242 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007243 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007244
Victor Stinner3a50e702011-10-18 21:21:00 +02007245error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007246 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007247 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7248 return -2;
7249 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007250 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007251}
7252
Victor Stinner3a50e702011-10-18 21:21:00 +02007253/*
7254 * Encode a Unicode string to a Windows code page into a byte string using a
7255 * error handler.
7256 *
7257 * Returns consumed characters if succeed, or raise a WindowsError and returns
7258 * -1 on other error.
7259 */
7260static int
7261encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007262 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007263 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007264{
Victor Stinner3a50e702011-10-18 21:21:00 +02007265 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007266 Py_ssize_t pos = unicode_offset;
7267 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007268 /* Ideally, we should get reason from FormatMessage. This is the Windows
7269 2000 English version of the message. */
7270 const char *reason = "invalid character";
7271 /* 4=maximum length of a UTF-8 sequence */
7272 char buffer[4];
7273 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7274 Py_ssize_t outsize;
7275 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007276 PyObject *errorHandler = NULL;
7277 PyObject *exc = NULL;
7278 PyObject *encoding_obj = NULL;
7279 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007280 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007281 PyObject *rep;
7282 int ret = -1;
7283
7284 assert(insize > 0);
7285
7286 encoding = code_page_name(code_page, &encoding_obj);
7287 if (encoding == NULL)
7288 return -1;
7289
7290 if (errors == NULL || strcmp(errors, "strict") == 0) {
7291 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7292 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007293 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007294 if (exc != NULL) {
7295 PyCodec_StrictErrors(exc);
7296 Py_DECREF(exc);
7297 }
7298 Py_XDECREF(encoding_obj);
7299 return -1;
7300 }
7301
7302 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7303 pusedDefaultChar = &usedDefaultChar;
7304 else
7305 pusedDefaultChar = NULL;
7306
7307 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7308 PyErr_NoMemory();
7309 goto error;
7310 }
7311 outsize = insize * Py_ARRAY_LENGTH(buffer);
7312
7313 if (*outbytes == NULL) {
7314 /* Create string object */
7315 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7316 if (*outbytes == NULL)
7317 goto error;
7318 out = PyBytes_AS_STRING(*outbytes);
7319 }
7320 else {
7321 /* Extend string object */
7322 Py_ssize_t n = PyBytes_Size(*outbytes);
7323 if (n > PY_SSIZE_T_MAX - outsize) {
7324 PyErr_NoMemory();
7325 goto error;
7326 }
7327 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7328 goto error;
7329 out = PyBytes_AS_STRING(*outbytes) + n;
7330 }
7331
7332 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007333 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007334 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007335 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7336 wchar_t chars[2];
7337 int charsize;
7338 if (ch < 0x10000) {
7339 chars[0] = (wchar_t)ch;
7340 charsize = 1;
7341 }
7342 else {
7343 ch -= 0x10000;
7344 chars[0] = 0xd800 + (ch >> 10);
7345 chars[1] = 0xdc00 + (ch & 0x3ff);
7346 charsize = 2;
7347 }
7348
Victor Stinner3a50e702011-10-18 21:21:00 +02007349 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007350 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007351 buffer, Py_ARRAY_LENGTH(buffer),
7352 NULL, pusedDefaultChar);
7353 if (outsize > 0) {
7354 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7355 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007356 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007357 memcpy(out, buffer, outsize);
7358 out += outsize;
7359 continue;
7360 }
7361 }
7362 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7363 PyErr_SetFromWindowsErr(0);
7364 goto error;
7365 }
7366
Victor Stinner3a50e702011-10-18 21:21:00 +02007367 rep = unicode_encode_call_errorhandler(
7368 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007369 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007370 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007371 if (rep == NULL)
7372 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007373 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007374
7375 if (PyBytes_Check(rep)) {
7376 outsize = PyBytes_GET_SIZE(rep);
7377 if (outsize != 1) {
7378 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7379 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7380 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7381 Py_DECREF(rep);
7382 goto error;
7383 }
7384 out = PyBytes_AS_STRING(*outbytes) + offset;
7385 }
7386 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7387 out += outsize;
7388 }
7389 else {
7390 Py_ssize_t i;
7391 enum PyUnicode_Kind kind;
7392 void *data;
7393
7394 if (PyUnicode_READY(rep) < 0) {
7395 Py_DECREF(rep);
7396 goto error;
7397 }
7398
7399 outsize = PyUnicode_GET_LENGTH(rep);
7400 if (outsize != 1) {
7401 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7402 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7403 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7404 Py_DECREF(rep);
7405 goto error;
7406 }
7407 out = PyBytes_AS_STRING(*outbytes) + offset;
7408 }
7409 kind = PyUnicode_KIND(rep);
7410 data = PyUnicode_DATA(rep);
7411 for (i=0; i < outsize; i++) {
7412 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7413 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007414 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007415 encoding, unicode,
7416 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007417 "unable to encode error handler result to ASCII");
7418 Py_DECREF(rep);
7419 goto error;
7420 }
7421 *out = (unsigned char)ch;
7422 out++;
7423 }
7424 }
7425 Py_DECREF(rep);
7426 }
7427 /* write a NUL byte */
7428 *out = 0;
7429 outsize = out - PyBytes_AS_STRING(*outbytes);
7430 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7431 if (_PyBytes_Resize(outbytes, outsize) < 0)
7432 goto error;
7433 ret = 0;
7434
7435error:
7436 Py_XDECREF(encoding_obj);
7437 Py_XDECREF(errorHandler);
7438 Py_XDECREF(exc);
7439 return ret;
7440}
7441
Victor Stinner3a50e702011-10-18 21:21:00 +02007442static PyObject *
7443encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007444 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007445 const char *errors)
7446{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007447 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007448 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007449 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007450 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007451
Victor Stinner2fc507f2011-11-04 20:06:39 +01007452 if (PyUnicode_READY(unicode) < 0)
7453 return NULL;
7454 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007455
Victor Stinner3a50e702011-10-18 21:21:00 +02007456 if (code_page < 0) {
7457 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7458 return NULL;
7459 }
7460
Martin v. Löwis3d325192011-11-04 18:23:06 +01007461 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007462 return PyBytes_FromStringAndSize(NULL, 0);
7463
Victor Stinner7581cef2011-11-03 22:32:33 +01007464 offset = 0;
7465 do
7466 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007467#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007468 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007469 chunks. */
7470 if (len > INT_MAX/2) {
7471 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007472 done = 0;
7473 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007474 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007475#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007476 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007477 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007478 done = 1;
7479 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007480
Victor Stinner76a31a62011-11-04 00:05:13 +01007481 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007482 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007483 errors);
7484 if (ret == -2)
7485 ret = encode_code_page_errors(code_page, &outbytes,
7486 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007487 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007488 if (ret < 0) {
7489 Py_XDECREF(outbytes);
7490 return NULL;
7491 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007492
Victor Stinner7581cef2011-11-03 22:32:33 +01007493 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007494 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007495 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007496
Victor Stinner3a50e702011-10-18 21:21:00 +02007497 return outbytes;
7498}
7499
7500PyObject *
7501PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7502 Py_ssize_t size,
7503 const char *errors)
7504{
Victor Stinner7581cef2011-11-03 22:32:33 +01007505 PyObject *unicode, *res;
7506 unicode = PyUnicode_FromUnicode(p, size);
7507 if (unicode == NULL)
7508 return NULL;
7509 res = encode_code_page(CP_ACP, unicode, errors);
7510 Py_DECREF(unicode);
7511 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007512}
7513
7514PyObject *
7515PyUnicode_EncodeCodePage(int code_page,
7516 PyObject *unicode,
7517 const char *errors)
7518{
Victor Stinner7581cef2011-11-03 22:32:33 +01007519 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007520}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007521
Alexander Belopolsky40018472011-02-26 01:02:56 +00007522PyObject *
7523PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007524{
7525 if (!PyUnicode_Check(unicode)) {
7526 PyErr_BadArgument();
7527 return NULL;
7528 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007529 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007530}
7531
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007532#undef NEED_RETRY
7533
Victor Stinner99b95382011-07-04 14:23:54 +02007534#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007535
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536/* --- Character Mapping Codec -------------------------------------------- */
7537
Alexander Belopolsky40018472011-02-26 01:02:56 +00007538PyObject *
7539PyUnicode_DecodeCharmap(const char *s,
7540 Py_ssize_t size,
7541 PyObject *mapping,
7542 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007544 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007545 Py_ssize_t startinpos;
7546 Py_ssize_t endinpos;
7547 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007548 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007549 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007550 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007551 PyObject *errorHandler = NULL;
7552 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007553
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554 /* Default to Latin-1 */
7555 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007556 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007558 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007560 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007562 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007563 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007564 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007565 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007566 Py_ssize_t maplen;
7567 enum PyUnicode_Kind kind;
7568 void *data;
7569 Py_UCS4 x;
7570
7571 if (PyUnicode_READY(mapping) < 0)
7572 return NULL;
7573
7574 maplen = PyUnicode_GET_LENGTH(mapping);
7575 data = PyUnicode_DATA(mapping);
7576 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007577 while (s < e) {
7578 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579
Benjamin Peterson29060642009-01-31 22:14:21 +00007580 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007581 x = PyUnicode_READ(kind, data, ch);
7582 else
7583 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007585 if (x == 0xfffe)
7586 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007587 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 startinpos = s-starts;
7589 endinpos = startinpos+1;
7590 if (unicode_decode_call_errorhandler(
7591 errors, &errorHandler,
7592 "charmap", "character maps to <undefined>",
7593 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007594 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 goto onError;
7596 }
7597 continue;
7598 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007599
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007600 if (unicode_putchar(&v, &outpos, x) < 0)
7601 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007602 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007603 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007604 }
7605 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007606 while (s < e) {
7607 unsigned char ch = *s;
7608 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007609
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7611 w = PyLong_FromLong((long)ch);
7612 if (w == NULL)
7613 goto onError;
7614 x = PyObject_GetItem(mapping, w);
7615 Py_DECREF(w);
7616 if (x == NULL) {
7617 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7618 /* No mapping found means: mapping is undefined. */
7619 PyErr_Clear();
7620 x = Py_None;
7621 Py_INCREF(x);
7622 } else
7623 goto onError;
7624 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007625
Benjamin Peterson29060642009-01-31 22:14:21 +00007626 /* Apply mapping */
7627 if (PyLong_Check(x)) {
7628 long value = PyLong_AS_LONG(x);
7629 if (value < 0 || value > 65535) {
7630 PyErr_SetString(PyExc_TypeError,
7631 "character mapping must be in range(65536)");
7632 Py_DECREF(x);
7633 goto onError;
7634 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007635 if (unicode_putchar(&v, &outpos, value) < 0)
7636 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 }
7638 else if (x == Py_None) {
7639 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 startinpos = s-starts;
7641 endinpos = startinpos+1;
7642 if (unicode_decode_call_errorhandler(
7643 errors, &errorHandler,
7644 "charmap", "character maps to <undefined>",
7645 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007646 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007647 Py_DECREF(x);
7648 goto onError;
7649 }
7650 Py_DECREF(x);
7651 continue;
7652 }
7653 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007654 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007655
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007656 if (PyUnicode_READY(x) < 0)
7657 goto onError;
7658 targetsize = PyUnicode_GET_LENGTH(x);
7659
7660 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007661 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007662 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007663 PyUnicode_READ_CHAR(x, 0)) < 0)
7664 goto onError;
7665 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007666 else if (targetsize > 1) {
7667 /* 1-n mapping */
7668 if (targetsize > extrachars) {
7669 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007670 Py_ssize_t needed = (targetsize - extrachars) + \
7671 (targetsize << 2);
7672 extrachars += needed;
7673 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007674 if (PyUnicode_Resize(&v,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007675 PyUnicode_GET_LENGTH(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007676 Py_DECREF(x);
7677 goto onError;
7678 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007679 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007680 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7681 goto onError;
7682 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7683 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 extrachars -= targetsize;
7685 }
7686 /* 1-0 mapping: skip the character */
7687 }
7688 else {
7689 /* wrong return value */
7690 PyErr_SetString(PyExc_TypeError,
7691 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007692 Py_DECREF(x);
7693 goto onError;
7694 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007695 Py_DECREF(x);
7696 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007697 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007699 if (PyUnicode_Resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007700 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007701 Py_XDECREF(errorHandler);
7702 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007703 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007704
Benjamin Peterson29060642009-01-31 22:14:21 +00007705 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007706 Py_XDECREF(errorHandler);
7707 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708 Py_XDECREF(v);
7709 return NULL;
7710}
7711
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007712/* Charmap encoding: the lookup table */
7713
Alexander Belopolsky40018472011-02-26 01:02:56 +00007714struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007715 PyObject_HEAD
7716 unsigned char level1[32];
7717 int count2, count3;
7718 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007719};
7720
7721static PyObject*
7722encoding_map_size(PyObject *obj, PyObject* args)
7723{
7724 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007725 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007727}
7728
7729static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007730 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007731 PyDoc_STR("Return the size (in bytes) of this object") },
7732 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007733};
7734
7735static void
7736encoding_map_dealloc(PyObject* o)
7737{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007738 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007739}
7740
7741static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007742 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007743 "EncodingMap", /*tp_name*/
7744 sizeof(struct encoding_map), /*tp_basicsize*/
7745 0, /*tp_itemsize*/
7746 /* methods */
7747 encoding_map_dealloc, /*tp_dealloc*/
7748 0, /*tp_print*/
7749 0, /*tp_getattr*/
7750 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007751 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 0, /*tp_repr*/
7753 0, /*tp_as_number*/
7754 0, /*tp_as_sequence*/
7755 0, /*tp_as_mapping*/
7756 0, /*tp_hash*/
7757 0, /*tp_call*/
7758 0, /*tp_str*/
7759 0, /*tp_getattro*/
7760 0, /*tp_setattro*/
7761 0, /*tp_as_buffer*/
7762 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7763 0, /*tp_doc*/
7764 0, /*tp_traverse*/
7765 0, /*tp_clear*/
7766 0, /*tp_richcompare*/
7767 0, /*tp_weaklistoffset*/
7768 0, /*tp_iter*/
7769 0, /*tp_iternext*/
7770 encoding_map_methods, /*tp_methods*/
7771 0, /*tp_members*/
7772 0, /*tp_getset*/
7773 0, /*tp_base*/
7774 0, /*tp_dict*/
7775 0, /*tp_descr_get*/
7776 0, /*tp_descr_set*/
7777 0, /*tp_dictoffset*/
7778 0, /*tp_init*/
7779 0, /*tp_alloc*/
7780 0, /*tp_new*/
7781 0, /*tp_free*/
7782 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007783};
7784
7785PyObject*
7786PyUnicode_BuildEncodingMap(PyObject* string)
7787{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007788 PyObject *result;
7789 struct encoding_map *mresult;
7790 int i;
7791 int need_dict = 0;
7792 unsigned char level1[32];
7793 unsigned char level2[512];
7794 unsigned char *mlevel1, *mlevel2, *mlevel3;
7795 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007796 int kind;
7797 void *data;
7798 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007800 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007801 PyErr_BadArgument();
7802 return NULL;
7803 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007804 kind = PyUnicode_KIND(string);
7805 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007806 memset(level1, 0xFF, sizeof level1);
7807 memset(level2, 0xFF, sizeof level2);
7808
7809 /* If there isn't a one-to-one mapping of NULL to \0,
7810 or if there are non-BMP characters, we need to use
7811 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007812 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007813 need_dict = 1;
7814 for (i = 1; i < 256; i++) {
7815 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007816 ch = PyUnicode_READ(kind, data, i);
7817 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007818 need_dict = 1;
7819 break;
7820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007821 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007822 /* unmapped character */
7823 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007824 l1 = ch >> 11;
7825 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007826 if (level1[l1] == 0xFF)
7827 level1[l1] = count2++;
7828 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007829 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007830 }
7831
7832 if (count2 >= 0xFF || count3 >= 0xFF)
7833 need_dict = 1;
7834
7835 if (need_dict) {
7836 PyObject *result = PyDict_New();
7837 PyObject *key, *value;
7838 if (!result)
7839 return NULL;
7840 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007841 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007842 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007843 if (!key || !value)
7844 goto failed1;
7845 if (PyDict_SetItem(result, key, value) == -1)
7846 goto failed1;
7847 Py_DECREF(key);
7848 Py_DECREF(value);
7849 }
7850 return result;
7851 failed1:
7852 Py_XDECREF(key);
7853 Py_XDECREF(value);
7854 Py_DECREF(result);
7855 return NULL;
7856 }
7857
7858 /* Create a three-level trie */
7859 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7860 16*count2 + 128*count3 - 1);
7861 if (!result)
7862 return PyErr_NoMemory();
7863 PyObject_Init(result, &EncodingMapType);
7864 mresult = (struct encoding_map*)result;
7865 mresult->count2 = count2;
7866 mresult->count3 = count3;
7867 mlevel1 = mresult->level1;
7868 mlevel2 = mresult->level23;
7869 mlevel3 = mresult->level23 + 16*count2;
7870 memcpy(mlevel1, level1, 32);
7871 memset(mlevel2, 0xFF, 16*count2);
7872 memset(mlevel3, 0, 128*count3);
7873 count3 = 0;
7874 for (i = 1; i < 256; i++) {
7875 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007876 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007877 /* unmapped character */
7878 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007879 o1 = PyUnicode_READ(kind, data, i)>>11;
7880 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007881 i2 = 16*mlevel1[o1] + o2;
7882 if (mlevel2[i2] == 0xFF)
7883 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007884 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007885 i3 = 128*mlevel2[i2] + o3;
7886 mlevel3[i3] = i;
7887 }
7888 return result;
7889}
7890
7891static int
Victor Stinner22168992011-11-20 17:09:18 +01007892encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007893{
7894 struct encoding_map *map = (struct encoding_map*)mapping;
7895 int l1 = c>>11;
7896 int l2 = (c>>7) & 0xF;
7897 int l3 = c & 0x7F;
7898 int i;
7899
Victor Stinner22168992011-11-20 17:09:18 +01007900 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007901 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007902 if (c == 0)
7903 return 0;
7904 /* level 1*/
7905 i = map->level1[l1];
7906 if (i == 0xFF) {
7907 return -1;
7908 }
7909 /* level 2*/
7910 i = map->level23[16*i+l2];
7911 if (i == 0xFF) {
7912 return -1;
7913 }
7914 /* level 3 */
7915 i = map->level23[16*map->count2 + 128*i + l3];
7916 if (i == 0) {
7917 return -1;
7918 }
7919 return i;
7920}
7921
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007922/* Lookup the character ch in the mapping. If the character
7923 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007924 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007925static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007926charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007927{
Christian Heimes217cfd12007-12-02 14:31:20 +00007928 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007929 PyObject *x;
7930
7931 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007932 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007933 x = PyObject_GetItem(mapping, w);
7934 Py_DECREF(w);
7935 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007936 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7937 /* No mapping found means: mapping is undefined. */
7938 PyErr_Clear();
7939 x = Py_None;
7940 Py_INCREF(x);
7941 return x;
7942 } else
7943 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007944 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007945 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007946 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007947 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007948 long value = PyLong_AS_LONG(x);
7949 if (value < 0 || value > 255) {
7950 PyErr_SetString(PyExc_TypeError,
7951 "character mapping must be in range(256)");
7952 Py_DECREF(x);
7953 return NULL;
7954 }
7955 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007957 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007958 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007959 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007960 /* wrong return value */
7961 PyErr_Format(PyExc_TypeError,
7962 "character mapping must return integer, bytes or None, not %.400s",
7963 x->ob_type->tp_name);
7964 Py_DECREF(x);
7965 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966 }
7967}
7968
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007969static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007970charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007971{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007972 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7973 /* exponentially overallocate to minimize reallocations */
7974 if (requiredsize < 2*outsize)
7975 requiredsize = 2*outsize;
7976 if (_PyBytes_Resize(outobj, requiredsize))
7977 return -1;
7978 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007979}
7980
/* Outcome of encoding a single character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output buffer */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* the lookup or a buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007984/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007985 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007986 space is available. Return a new reference to the object that
7987 was put in the output buffer, or Py_None, if the mapping was undefined
7988 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007989 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007990static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007991charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007992 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007993{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007994 PyObject *rep;
7995 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007996 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007997
Christian Heimes90aa7642007-12-19 02:45:37 +00007998 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007999 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008000 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008001 if (res == -1)
8002 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 if (outsize<requiredsize)
8004 if (charmapencode_resize(outobj, outpos, requiredsize))
8005 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008006 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008007 outstart[(*outpos)++] = (char)res;
8008 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008009 }
8010
8011 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008012 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008013 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008014 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 Py_DECREF(rep);
8016 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008017 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008018 if (PyLong_Check(rep)) {
8019 Py_ssize_t requiredsize = *outpos+1;
8020 if (outsize<requiredsize)
8021 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8022 Py_DECREF(rep);
8023 return enc_EXCEPTION;
8024 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008025 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008027 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 else {
8029 const char *repchars = PyBytes_AS_STRING(rep);
8030 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8031 Py_ssize_t requiredsize = *outpos+repsize;
8032 if (outsize<requiredsize)
8033 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8034 Py_DECREF(rep);
8035 return enc_EXCEPTION;
8036 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008037 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008038 memcpy(outstart + *outpos, repchars, repsize);
8039 *outpos += repsize;
8040 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008041 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008042 Py_DECREF(rep);
8043 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008044}
8045
8046/* handle an error in PyUnicode_EncodeCharmap
8047 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008048static int
8049charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008050 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008051 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008052 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008053 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008054{
8055 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008056 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008057 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008058 enum PyUnicode_Kind kind;
8059 void *data;
8060 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008061 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008062 Py_ssize_t collstartpos = *inpos;
8063 Py_ssize_t collendpos = *inpos+1;
8064 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008065 char *encoding = "charmap";
8066 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008067 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008068 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008069 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008070
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008071 if (PyUnicode_READY(unicode) < 0)
8072 return -1;
8073 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008074 /* find all unencodable characters */
8075 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008076 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008077 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008078 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008079 val = encoding_map_lookup(ch, mapping);
8080 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 break;
8082 ++collendpos;
8083 continue;
8084 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008085
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008086 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8087 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008088 if (rep==NULL)
8089 return -1;
8090 else if (rep!=Py_None) {
8091 Py_DECREF(rep);
8092 break;
8093 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008094 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008095 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008096 }
8097 /* cache callback name lookup
8098 * (if not done yet, i.e. it's the first error) */
8099 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008100 if ((errors==NULL) || (!strcmp(errors, "strict")))
8101 *known_errorHandler = 1;
8102 else if (!strcmp(errors, "replace"))
8103 *known_errorHandler = 2;
8104 else if (!strcmp(errors, "ignore"))
8105 *known_errorHandler = 3;
8106 else if (!strcmp(errors, "xmlcharrefreplace"))
8107 *known_errorHandler = 4;
8108 else
8109 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008110 }
8111 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008112 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008113 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008114 return -1;
8115 case 2: /* replace */
8116 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008117 x = charmapencode_output('?', mapping, res, respos);
8118 if (x==enc_EXCEPTION) {
8119 return -1;
8120 }
8121 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008122 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008123 return -1;
8124 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008125 }
8126 /* fall through */
8127 case 3: /* ignore */
8128 *inpos = collendpos;
8129 break;
8130 case 4: /* xmlcharrefreplace */
8131 /* generate replacement (temporarily (mis)uses p) */
8132 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 char buffer[2+29+1+1];
8134 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008135 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008136 for (cp = buffer; *cp; ++cp) {
8137 x = charmapencode_output(*cp, mapping, res, respos);
8138 if (x==enc_EXCEPTION)
8139 return -1;
8140 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008141 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008142 return -1;
8143 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008144 }
8145 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008146 *inpos = collendpos;
8147 break;
8148 default:
8149 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008150 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008151 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008152 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008153 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008154 if (PyBytes_Check(repunicode)) {
8155 /* Directly copy bytes result to output. */
8156 Py_ssize_t outsize = PyBytes_Size(*res);
8157 Py_ssize_t requiredsize;
8158 repsize = PyBytes_Size(repunicode);
8159 requiredsize = *respos + repsize;
8160 if (requiredsize > outsize)
8161 /* Make room for all additional bytes. */
8162 if (charmapencode_resize(res, respos, requiredsize)) {
8163 Py_DECREF(repunicode);
8164 return -1;
8165 }
8166 memcpy(PyBytes_AsString(*res) + *respos,
8167 PyBytes_AsString(repunicode), repsize);
8168 *respos += repsize;
8169 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008170 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008171 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008172 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008173 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008174 if (PyUnicode_READY(repunicode) < 0) {
8175 Py_DECREF(repunicode);
8176 return -1;
8177 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008178 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008179 data = PyUnicode_DATA(repunicode);
8180 kind = PyUnicode_KIND(repunicode);
8181 for (index = 0; index < repsize; index++) {
8182 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8183 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008185 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 return -1;
8187 }
8188 else if (x==enc_FAILED) {
8189 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008190 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 return -1;
8192 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008193 }
8194 *inpos = newpos;
8195 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008196 }
8197 return 0;
8198}
8199
Alexander Belopolsky40018472011-02-26 01:02:56 +00008200PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008201_PyUnicode_EncodeCharmap(PyObject *unicode,
8202 PyObject *mapping,
8203 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008205 /* output object */
8206 PyObject *res = NULL;
8207 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008208 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008209 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008210 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008211 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008212 PyObject *errorHandler = NULL;
8213 PyObject *exc = NULL;
8214 /* the following variable is used for caching string comparisons
8215 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8216 * 3=ignore, 4=xmlcharrefreplace */
8217 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008219 if (PyUnicode_READY(unicode) < 0)
8220 return NULL;
8221 size = PyUnicode_GET_LENGTH(unicode);
8222
Guido van Rossumd57fd912000-03-10 22:53:23 +00008223 /* Default to Latin-1 */
8224 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008225 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008227 /* allocate enough for a simple encoding without
8228 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008229 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008230 if (res == NULL)
8231 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008232 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008233 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008235 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008236 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008238 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 if (x==enc_EXCEPTION) /* error */
8240 goto onError;
8241 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008242 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 &exc,
8244 &known_errorHandler, &errorHandler, errors,
8245 &res, &respos)) {
8246 goto onError;
8247 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008248 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008249 else
8250 /* done with this character => adjust input position */
8251 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008254 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008255 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008256 if (_PyBytes_Resize(&res, respos) < 0)
8257 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008258
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008259 Py_XDECREF(exc);
8260 Py_XDECREF(errorHandler);
8261 return res;
8262
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264 Py_XDECREF(res);
8265 Py_XDECREF(exc);
8266 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267 return NULL;
8268}
8269
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008270/* Deprecated */
8271PyObject *
8272PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8273 Py_ssize_t size,
8274 PyObject *mapping,
8275 const char *errors)
8276{
8277 PyObject *result;
8278 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8279 if (unicode == NULL)
8280 return NULL;
8281 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8282 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008283 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008284}
8285
Alexander Belopolsky40018472011-02-26 01:02:56 +00008286PyObject *
8287PyUnicode_AsCharmapString(PyObject *unicode,
8288 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289{
8290 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 PyErr_BadArgument();
8292 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008294 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295}
8296
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008297/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008298static void
8299make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008300 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008301 Py_ssize_t startpos, Py_ssize_t endpos,
8302 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008304 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008305 *exceptionObject = _PyUnicodeTranslateError_Create(
8306 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307 }
8308 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8310 goto onError;
8311 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8312 goto onError;
8313 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8314 goto onError;
8315 return;
8316 onError:
8317 Py_DECREF(*exceptionObject);
8318 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319 }
8320}
8321
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008322/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008323static void
8324raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008325 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008326 Py_ssize_t startpos, Py_ssize_t endpos,
8327 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008328{
8329 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008330 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008331 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008332 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008333}
8334
8335/* error handling callback helper:
8336 build arguments, call the callback and check the arguments,
8337 put the result into newpos and return the replacement string, which
8338 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008339static PyObject *
8340unicode_translate_call_errorhandler(const char *errors,
8341 PyObject **errorHandler,
8342 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008343 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008344 Py_ssize_t startpos, Py_ssize_t endpos,
8345 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008346{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008347 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008348
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008349 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008350 PyObject *restuple;
8351 PyObject *resunicode;
8352
8353 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008355 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008357 }
8358
8359 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008360 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008361 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008363
8364 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008366 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008368 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008369 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008370 Py_DECREF(restuple);
8371 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008372 }
8373 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 &resunicode, &i_newpos)) {
8375 Py_DECREF(restuple);
8376 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008378 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008379 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008380 else
8381 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008382 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8384 Py_DECREF(restuple);
8385 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008386 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387 Py_INCREF(resunicode);
8388 Py_DECREF(restuple);
8389 return resunicode;
8390}
8391
8392/* Lookup the character ch in the mapping and put the result in result,
8393 which must be decrefed by the caller.
8394 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008395static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008396charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008397{
Christian Heimes217cfd12007-12-02 14:31:20 +00008398 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008399 PyObject *x;
8400
8401 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403 x = PyObject_GetItem(mapping, w);
8404 Py_DECREF(w);
8405 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008406 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8407 /* No mapping found means: use 1:1 mapping. */
8408 PyErr_Clear();
8409 *result = NULL;
8410 return 0;
8411 } else
8412 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413 }
8414 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 *result = x;
8416 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008418 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 long value = PyLong_AS_LONG(x);
8420 long max = PyUnicode_GetMax();
8421 if (value < 0 || value > max) {
8422 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008423 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 Py_DECREF(x);
8425 return -1;
8426 }
8427 *result = x;
8428 return 0;
8429 }
8430 else if (PyUnicode_Check(x)) {
8431 *result = x;
8432 return 0;
8433 }
8434 else {
8435 /* wrong return value */
8436 PyErr_SetString(PyExc_TypeError,
8437 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008438 Py_DECREF(x);
8439 return -1;
8440 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008441}
8442/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 if not reallocate and adjust various state variables.
8444 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008445static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008446charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008448{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008449 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008450 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 /* exponentially overallocate to minimize reallocations */
8452 if (requiredsize < 2 * oldsize)
8453 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8455 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008457 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008458 }
8459 return 0;
8460}
8461/* lookup the character, put the result in the output string and adjust
8462 various state variables. Return a new reference to the object that
8463 was put in the output buffer in *result, or Py_None, if the mapping was
8464 undefined (in which case no character was written).
8465 The caller must decref result.
8466 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008467static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008468charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8469 PyObject *mapping, Py_UCS4 **output,
8470 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008471 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008472{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008473 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8474 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008476 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008478 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008479 }
8480 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008481 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008482 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008484 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008485 }
8486 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008487 Py_ssize_t repsize;
8488 if (PyUnicode_READY(*res) == -1)
8489 return -1;
8490 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 if (repsize==1) {
8492 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008493 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 }
8495 else if (repsize!=0) {
8496 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008497 Py_ssize_t requiredsize = *opos +
8498 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008500 Py_ssize_t i;
8501 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008503 for(i = 0; i < repsize; i++)
8504 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008506 }
8507 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008509 return 0;
8510}
8511
Alexander Belopolsky40018472011-02-26 01:02:56 +00008512PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008513_PyUnicode_TranslateCharmap(PyObject *input,
8514 PyObject *mapping,
8515 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008517 /* input object */
8518 char *idata;
8519 Py_ssize_t size, i;
8520 int kind;
8521 /* output buffer */
8522 Py_UCS4 *output = NULL;
8523 Py_ssize_t osize;
8524 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008525 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008527 char *reason = "character maps to <undefined>";
8528 PyObject *errorHandler = NULL;
8529 PyObject *exc = NULL;
8530 /* the following variable is used for caching string comparisons
8531 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8532 * 3=ignore, 4=xmlcharrefreplace */
8533 int known_errorHandler = -1;
8534
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 PyErr_BadArgument();
8537 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540 if (PyUnicode_READY(input) == -1)
8541 return NULL;
8542 idata = (char*)PyUnicode_DATA(input);
8543 kind = PyUnicode_KIND(input);
8544 size = PyUnicode_GET_LENGTH(input);
8545 i = 0;
8546
8547 if (size == 0) {
8548 Py_INCREF(input);
8549 return input;
8550 }
8551
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008552 /* allocate enough for a simple 1:1 translation without
8553 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008554 osize = size;
8555 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8556 opos = 0;
8557 if (output == NULL) {
8558 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008560 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008562 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 /* try to encode it */
8564 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008565 if (charmaptranslate_output(input, i, mapping,
8566 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 Py_XDECREF(x);
8568 goto onError;
8569 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008570 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008572 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 else { /* untranslatable character */
8574 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8575 Py_ssize_t repsize;
8576 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008577 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579 Py_ssize_t collstart = i;
8580 Py_ssize_t collend = i+1;
8581 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008584 while (collend < size) {
8585 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 goto onError;
8587 Py_XDECREF(x);
8588 if (x!=Py_None)
8589 break;
8590 ++collend;
8591 }
8592 /* cache callback name lookup
8593 * (if not done yet, i.e. it's the first error) */
8594 if (known_errorHandler==-1) {
8595 if ((errors==NULL) || (!strcmp(errors, "strict")))
8596 known_errorHandler = 1;
8597 else if (!strcmp(errors, "replace"))
8598 known_errorHandler = 2;
8599 else if (!strcmp(errors, "ignore"))
8600 known_errorHandler = 3;
8601 else if (!strcmp(errors, "xmlcharrefreplace"))
8602 known_errorHandler = 4;
8603 else
8604 known_errorHandler = 0;
8605 }
8606 switch (known_errorHandler) {
8607 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008608 raise_translate_exception(&exc, input, collstart,
8609 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008610 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008611 case 2: /* replace */
8612 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613 for (coll = collstart; coll<collend; coll++)
8614 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 /* fall through */
8616 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008617 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 break;
8619 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008620 /* generate replacement (temporarily (mis)uses i) */
8621 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 char buffer[2+29+1+1];
8623 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008624 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8625 if (charmaptranslate_makespace(&output, &osize,
8626 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 goto onError;
8628 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008629 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 break;
8633 default:
8634 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635 reason, input, &exc,
8636 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008637 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008639 if (PyUnicode_READY(repunicode) < 0) {
8640 Py_DECREF(repunicode);
8641 goto onError;
8642 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644 repsize = PyUnicode_GET_LENGTH(repunicode);
8645 if (charmaptranslate_makespace(&output, &osize,
8646 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008647 Py_DECREF(repunicode);
8648 goto onError;
8649 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008650 for (uni2 = 0; repsize-->0; ++uni2)
8651 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8652 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008654 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008655 }
8656 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8658 if (!res)
8659 goto onError;
8660 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008661 Py_XDECREF(exc);
8662 Py_XDECREF(errorHandler);
8663 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008664
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008667 Py_XDECREF(exc);
8668 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669 return NULL;
8670}
8671
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008672/* Deprecated. Use PyUnicode_Translate instead. */
8673PyObject *
8674PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8675 Py_ssize_t size,
8676 PyObject *mapping,
8677 const char *errors)
8678{
8679 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8680 if (!unicode)
8681 return NULL;
8682 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8683}
8684
Alexander Belopolsky40018472011-02-26 01:02:56 +00008685PyObject *
8686PyUnicode_Translate(PyObject *str,
8687 PyObject *mapping,
8688 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689{
8690 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008691
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692 str = PyUnicode_FromObject(str);
8693 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008695 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008696 Py_DECREF(str);
8697 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008698
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700 Py_XDECREF(str);
8701 return NULL;
8702}
Tim Petersced69f82003-09-16 20:30:58 +00008703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008704static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008705fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008706{
8707 /* No need to call PyUnicode_READY(self) because this function is only
8708 called as a callback from fixup() which does it already. */
8709 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8710 const int kind = PyUnicode_KIND(self);
8711 void *data = PyUnicode_DATA(self);
8712 Py_UCS4 maxchar = 0, ch, fixed;
8713 Py_ssize_t i;
8714
8715 for (i = 0; i < len; ++i) {
8716 ch = PyUnicode_READ(kind, data, i);
8717 fixed = 0;
8718 if (ch > 127) {
8719 if (Py_UNICODE_ISSPACE(ch))
8720 fixed = ' ';
8721 else {
8722 const int decimal = Py_UNICODE_TODECIMAL(ch);
8723 if (decimal >= 0)
8724 fixed = '0' + decimal;
8725 }
8726 if (fixed != 0) {
8727 if (fixed > maxchar)
8728 maxchar = fixed;
8729 PyUnicode_WRITE(kind, data, i, fixed);
8730 }
8731 else if (ch > maxchar)
8732 maxchar = ch;
8733 }
8734 else if (ch > maxchar)
8735 maxchar = ch;
8736 }
8737
8738 return maxchar;
8739}
8740
8741PyObject *
8742_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8743{
8744 if (!PyUnicode_Check(unicode)) {
8745 PyErr_BadInternalCall();
8746 return NULL;
8747 }
8748 if (PyUnicode_READY(unicode) == -1)
8749 return NULL;
8750 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8751 /* If the string is already ASCII, just return the same string */
8752 Py_INCREF(unicode);
8753 return unicode;
8754 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008755 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008756}
8757
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008758PyObject *
8759PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8760 Py_ssize_t length)
8761{
Victor Stinnerf0124502011-11-21 23:12:56 +01008762 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008763 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008764 Py_UCS4 maxchar;
8765 enum PyUnicode_Kind kind;
8766 void *data;
8767
8768 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008769 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008770 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008771 if (ch > 127) {
8772 int decimal = Py_UNICODE_TODECIMAL(ch);
8773 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008774 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008775 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008776 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008777 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008778
8779 /* Copy to a new string */
8780 decimal = PyUnicode_New(length, maxchar);
8781 if (decimal == NULL)
8782 return decimal;
8783 kind = PyUnicode_KIND(decimal);
8784 data = PyUnicode_DATA(decimal);
8785 /* Iterate over code points */
8786 for (i = 0; i < length; i++) {
8787 Py_UNICODE ch = s[i];
8788 if (ch > 127) {
8789 int decimal = Py_UNICODE_TODECIMAL(ch);
8790 if (decimal >= 0)
8791 ch = '0' + decimal;
8792 }
8793 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008794 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008795 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008796}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008797/* --- Decimal Encoder ---------------------------------------------------- */
8798
Alexander Belopolsky40018472011-02-26 01:02:56 +00008799int
8800PyUnicode_EncodeDecimal(Py_UNICODE *s,
8801 Py_ssize_t length,
8802 char *output,
8803 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008804{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008805 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008806 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008807 enum PyUnicode_Kind kind;
8808 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008809
8810 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008811 PyErr_BadArgument();
8812 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008813 }
8814
Victor Stinner42bf7752011-11-21 22:52:58 +01008815 unicode = PyUnicode_FromUnicode(s, length);
8816 if (unicode == NULL)
8817 return -1;
8818
Victor Stinner6345be92011-11-25 20:09:01 +01008819 if (PyUnicode_READY(unicode) < 0) {
8820 Py_DECREF(unicode);
8821 return -1;
8822 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008823 kind = PyUnicode_KIND(unicode);
8824 data = PyUnicode_DATA(unicode);
8825
Victor Stinnerb84d7232011-11-22 01:50:07 +01008826 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008827 PyObject *exc;
8828 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008829 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008830 Py_ssize_t startpos;
8831
8832 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008833
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008835 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008836 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008838 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008839 decimal = Py_UNICODE_TODECIMAL(ch);
8840 if (decimal >= 0) {
8841 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008842 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008843 continue;
8844 }
8845 if (0 < ch && ch < 256) {
8846 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008847 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008848 continue;
8849 }
Victor Stinner6345be92011-11-25 20:09:01 +01008850
Victor Stinner42bf7752011-11-21 22:52:58 +01008851 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008852 exc = NULL;
8853 raise_encode_exception(&exc, "decimal", unicode,
8854 startpos, startpos+1,
8855 "invalid decimal Unicode string");
8856 Py_XDECREF(exc);
8857 Py_DECREF(unicode);
8858 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008859 }
8860 /* 0-terminate the output string */
8861 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008862 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008863 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008864}
8865
Guido van Rossumd57fd912000-03-10 22:53:23 +00008866/* --- Helpers ------------------------------------------------------------ */
8867
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008868static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008869any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008870 Py_ssize_t start,
8871 Py_ssize_t end)
8872{
8873 int kind1, kind2, kind;
8874 void *buf1, *buf2;
8875 Py_ssize_t len1, len2, result;
8876
8877 kind1 = PyUnicode_KIND(s1);
8878 kind2 = PyUnicode_KIND(s2);
8879 kind = kind1 > kind2 ? kind1 : kind2;
8880 buf1 = PyUnicode_DATA(s1);
8881 buf2 = PyUnicode_DATA(s2);
8882 if (kind1 != kind)
8883 buf1 = _PyUnicode_AsKind(s1, kind);
8884 if (!buf1)
8885 return -2;
8886 if (kind2 != kind)
8887 buf2 = _PyUnicode_AsKind(s2, kind);
8888 if (!buf2) {
8889 if (kind1 != kind) PyMem_Free(buf1);
8890 return -2;
8891 }
8892 len1 = PyUnicode_GET_LENGTH(s1);
8893 len2 = PyUnicode_GET_LENGTH(s2);
8894
Victor Stinner794d5672011-10-10 03:21:36 +02008895 if (direction > 0) {
8896 switch(kind) {
8897 case PyUnicode_1BYTE_KIND:
8898 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8899 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8900 else
8901 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8902 break;
8903 case PyUnicode_2BYTE_KIND:
8904 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8905 break;
8906 case PyUnicode_4BYTE_KIND:
8907 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8908 break;
8909 default:
8910 assert(0); result = -2;
8911 }
8912 }
8913 else {
8914 switch(kind) {
8915 case PyUnicode_1BYTE_KIND:
8916 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8917 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8918 else
8919 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8920 break;
8921 case PyUnicode_2BYTE_KIND:
8922 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8923 break;
8924 case PyUnicode_4BYTE_KIND:
8925 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8926 break;
8927 default:
8928 assert(0); result = -2;
8929 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008930 }
8931
8932 if (kind1 != kind)
8933 PyMem_Free(buf1);
8934 if (kind2 != kind)
8935 PyMem_Free(buf2);
8936
8937 return result;
8938}
8939
8940Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008941_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942 Py_ssize_t n_buffer,
8943 void *digits, Py_ssize_t n_digits,
8944 Py_ssize_t min_width,
8945 const char *grouping,
8946 const char *thousands_sep)
8947{
8948 switch(kind) {
8949 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008950 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8951 return _PyUnicode_ascii_InsertThousandsGrouping(
8952 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8953 min_width, grouping, thousands_sep);
8954 else
8955 return _PyUnicode_ucs1_InsertThousandsGrouping(
8956 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8957 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958 case PyUnicode_2BYTE_KIND:
8959 return _PyUnicode_ucs2_InsertThousandsGrouping(
8960 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8961 min_width, grouping, thousands_sep);
8962 case PyUnicode_4BYTE_KIND:
8963 return _PyUnicode_ucs4_InsertThousandsGrouping(
8964 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8965 min_width, grouping, thousands_sep);
8966 }
8967 assert(0);
8968 return -1;
8969}
8970
8971
/* Helper macro to fix up start/end slice values against a sequence of
   length len: end is clamped to len, negative values are taken relative
   to the end of the sequence and then clamped to 0.  start is not clamped
   to len.

   start and end must be modifiable lvalues.  Every argument is
   parenthesized so that an expression (for example a conditional
   expression) can be passed as len.  The macro expands to two if
   statements, so use it only as a complete statement inside a braced
   block. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008986
Alexander Belopolsky40018472011-02-26 01:02:56 +00008987Py_ssize_t
8988PyUnicode_Count(PyObject *str,
8989 PyObject *substr,
8990 Py_ssize_t start,
8991 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008993 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008994 PyObject* str_obj;
8995 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008996 int kind1, kind2, kind;
8997 void *buf1 = NULL, *buf2 = NULL;
8998 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008999
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009000 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009002 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009003 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009004 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009005 Py_DECREF(str_obj);
9006 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007 }
Tim Petersced69f82003-09-16 20:30:58 +00009008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 kind1 = PyUnicode_KIND(str_obj);
9010 kind2 = PyUnicode_KIND(sub_obj);
9011 kind = kind1 > kind2 ? kind1 : kind2;
9012 buf1 = PyUnicode_DATA(str_obj);
9013 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009014 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009015 if (!buf1)
9016 goto onError;
9017 buf2 = PyUnicode_DATA(sub_obj);
9018 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009019 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009020 if (!buf2)
9021 goto onError;
9022 len1 = PyUnicode_GET_LENGTH(str_obj);
9023 len2 = PyUnicode_GET_LENGTH(sub_obj);
9024
9025 ADJUST_INDICES(start, end, len1);
9026 switch(kind) {
9027 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009028 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9029 result = asciilib_count(
9030 ((Py_UCS1*)buf1) + start, end - start,
9031 buf2, len2, PY_SSIZE_T_MAX
9032 );
9033 else
9034 result = ucs1lib_count(
9035 ((Py_UCS1*)buf1) + start, end - start,
9036 buf2, len2, PY_SSIZE_T_MAX
9037 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009038 break;
9039 case PyUnicode_2BYTE_KIND:
9040 result = ucs2lib_count(
9041 ((Py_UCS2*)buf1) + start, end - start,
9042 buf2, len2, PY_SSIZE_T_MAX
9043 );
9044 break;
9045 case PyUnicode_4BYTE_KIND:
9046 result = ucs4lib_count(
9047 ((Py_UCS4*)buf1) + start, end - start,
9048 buf2, len2, PY_SSIZE_T_MAX
9049 );
9050 break;
9051 default:
9052 assert(0); result = 0;
9053 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009054
9055 Py_DECREF(sub_obj);
9056 Py_DECREF(str_obj);
9057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009058 if (kind1 != kind)
9059 PyMem_Free(buf1);
9060 if (kind2 != kind)
9061 PyMem_Free(buf2);
9062
Guido van Rossumd57fd912000-03-10 22:53:23 +00009063 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 onError:
9065 Py_DECREF(sub_obj);
9066 Py_DECREF(str_obj);
9067 if (kind1 != kind && buf1)
9068 PyMem_Free(buf1);
9069 if (kind2 != kind && buf2)
9070 PyMem_Free(buf2);
9071 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009072}
9073
Alexander Belopolsky40018472011-02-26 01:02:56 +00009074Py_ssize_t
9075PyUnicode_Find(PyObject *str,
9076 PyObject *sub,
9077 Py_ssize_t start,
9078 Py_ssize_t end,
9079 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009080{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009081 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009082
Guido van Rossumd57fd912000-03-10 22:53:23 +00009083 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009084 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009085 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009086 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009087 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009088 Py_DECREF(str);
9089 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009090 }
Tim Petersced69f82003-09-16 20:30:58 +00009091
Victor Stinner794d5672011-10-10 03:21:36 +02009092 result = any_find_slice(direction,
9093 str, sub, start, end
9094 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009095
Guido van Rossumd57fd912000-03-10 22:53:23 +00009096 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009097 Py_DECREF(sub);
9098
Guido van Rossumd57fd912000-03-10 22:53:23 +00009099 return result;
9100}
9101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102Py_ssize_t
9103PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9104 Py_ssize_t start, Py_ssize_t end,
9105 int direction)
9106{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009107 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009108 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009109 if (PyUnicode_READY(str) == -1)
9110 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009111 if (start < 0 || end < 0) {
9112 PyErr_SetString(PyExc_IndexError, "string index out of range");
9113 return -2;
9114 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009115 if (end > PyUnicode_GET_LENGTH(str))
9116 end = PyUnicode_GET_LENGTH(str);
9117 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009118 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9119 kind, end-start, ch, direction);
9120 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009121 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009122 else
9123 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009124}
9125
Alexander Belopolsky40018472011-02-26 01:02:56 +00009126static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009127tailmatch(PyObject *self,
9128 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009129 Py_ssize_t start,
9130 Py_ssize_t end,
9131 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009132{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009133 int kind_self;
9134 int kind_sub;
9135 void *data_self;
9136 void *data_sub;
9137 Py_ssize_t offset;
9138 Py_ssize_t i;
9139 Py_ssize_t end_sub;
9140
9141 if (PyUnicode_READY(self) == -1 ||
9142 PyUnicode_READY(substring) == -1)
9143 return 0;
9144
9145 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009146 return 1;
9147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009148 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9149 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009151 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009153 kind_self = PyUnicode_KIND(self);
9154 data_self = PyUnicode_DATA(self);
9155 kind_sub = PyUnicode_KIND(substring);
9156 data_sub = PyUnicode_DATA(substring);
9157 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9158
9159 if (direction > 0)
9160 offset = end;
9161 else
9162 offset = start;
9163
9164 if (PyUnicode_READ(kind_self, data_self, offset) ==
9165 PyUnicode_READ(kind_sub, data_sub, 0) &&
9166 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9167 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9168 /* If both are of the same kind, memcmp is sufficient */
9169 if (kind_self == kind_sub) {
9170 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009171 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009172 data_sub,
9173 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009174 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009175 }
9176 /* otherwise we have to compare each character by first accesing it */
9177 else {
9178 /* We do not need to compare 0 and len(substring)-1 because
9179 the if statement above ensured already that they are equal
9180 when we end up here. */
9181 // TODO: honor direction and do a forward or backwards search
9182 for (i = 1; i < end_sub; ++i) {
9183 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9184 PyUnicode_READ(kind_sub, data_sub, i))
9185 return 0;
9186 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009187 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009188 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009189 }
9190
9191 return 0;
9192}
9193
Alexander Belopolsky40018472011-02-26 01:02:56 +00009194Py_ssize_t
9195PyUnicode_Tailmatch(PyObject *str,
9196 PyObject *substr,
9197 Py_ssize_t start,
9198 Py_ssize_t end,
9199 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009200{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009201 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009202
Guido van Rossumd57fd912000-03-10 22:53:23 +00009203 str = PyUnicode_FromObject(str);
9204 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009205 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009206 substr = PyUnicode_FromObject(substr);
9207 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009208 Py_DECREF(str);
9209 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009210 }
Tim Petersced69f82003-09-16 20:30:58 +00009211
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009212 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009213 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009214 Py_DECREF(str);
9215 Py_DECREF(substr);
9216 return result;
9217}
9218
Guido van Rossumd57fd912000-03-10 22:53:23 +00009219/* Apply fixfct filter to the Unicode object self and return a
9220 reference to the modified object */
9221
Alexander Belopolsky40018472011-02-26 01:02:56 +00009222static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009223fixup(PyObject *self,
9224 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009226 PyObject *u;
9227 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009228 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009229
Victor Stinner87af4f22011-11-21 23:03:47 +01009230 u = PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009231 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009232 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009233 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009235 /* fix functions return the new maximum character in a string,
9236 if the kind of the resulting unicode object does not change,
9237 everything is fine. Otherwise we need to change the string kind
9238 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009239 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009240
9241 if (maxchar_new == 0) {
9242 /* no changes */;
9243 if (PyUnicode_CheckExact(self)) {
9244 Py_DECREF(u);
9245 Py_INCREF(self);
9246 return self;
9247 }
9248 else
9249 return u;
9250 }
9251
9252 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009253 maxchar_new = 127;
9254 else if (maxchar_new <= 255)
9255 maxchar_new = 255;
9256 else if (maxchar_new <= 65535)
9257 maxchar_new = 65535;
9258 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009259 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009260
Victor Stinnereaab6042011-12-11 22:22:39 +01009261 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009262 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009263
9264 /* In case the maximum character changed, we need to
9265 convert the string to the new category. */
9266 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9267 if (v == NULL) {
9268 Py_DECREF(u);
9269 return NULL;
9270 }
9271 if (maxchar_new > maxchar_old) {
9272 /* If the maxchar increased so that the kind changed, not all
9273 characters are representable anymore and we need to fix the
9274 string again. This only happens in very few cases. */
9275 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9276 maxchar_old = fixfct(v);
9277 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278 }
9279 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009280 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009282 Py_DECREF(u);
9283 assert(_PyUnicode_CheckConsistency(v, 1));
9284 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009285}
9286
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009287static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009288fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009290 /* No need to call PyUnicode_READY(self) because this function is only
9291 called as a callback from fixup() which does it already. */
9292 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9293 const int kind = PyUnicode_KIND(self);
9294 void *data = PyUnicode_DATA(self);
9295 int touched = 0;
9296 Py_UCS4 maxchar = 0;
9297 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009298
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009299 for (i = 0; i < len; ++i) {
9300 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9301 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9302 if (up != ch) {
9303 if (up > maxchar)
9304 maxchar = up;
9305 PyUnicode_WRITE(kind, data, i, up);
9306 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009307 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308 else if (ch > maxchar)
9309 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009310 }
9311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312 if (touched)
9313 return maxchar;
9314 else
9315 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009316}
9317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009319fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009320{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009321 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9322 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9323 const int kind = PyUnicode_KIND(self);
9324 void *data = PyUnicode_DATA(self);
9325 int touched = 0;
9326 Py_UCS4 maxchar = 0;
9327 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329 for(i = 0; i < len; ++i) {
9330 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9331 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9332 if (lo != ch) {
9333 if (lo > maxchar)
9334 maxchar = lo;
9335 PyUnicode_WRITE(kind, data, i, lo);
9336 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009337 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009338 else if (ch > maxchar)
9339 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009340 }
9341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 if (touched)
9343 return maxchar;
9344 else
9345 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009346}
9347
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009349fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009350{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009351 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9352 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9353 const int kind = PyUnicode_KIND(self);
9354 void *data = PyUnicode_DATA(self);
9355 int touched = 0;
9356 Py_UCS4 maxchar = 0;
9357 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009359 for(i = 0; i < len; ++i) {
9360 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9361 Py_UCS4 nu = 0;
9362
9363 if (Py_UNICODE_ISUPPER(ch))
9364 nu = Py_UNICODE_TOLOWER(ch);
9365 else if (Py_UNICODE_ISLOWER(ch))
9366 nu = Py_UNICODE_TOUPPER(ch);
9367
9368 if (nu != 0) {
9369 if (nu > maxchar)
9370 maxchar = nu;
9371 PyUnicode_WRITE(kind, data, i, nu);
9372 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009374 else if (ch > maxchar)
9375 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009376 }
9377
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 if (touched)
9379 return maxchar;
9380 else
9381 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009382}
9383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009384static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009385fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009386{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009387 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9388 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9389 const int kind = PyUnicode_KIND(self);
9390 void *data = PyUnicode_DATA(self);
9391 int touched = 0;
9392 Py_UCS4 maxchar = 0;
9393 Py_ssize_t i = 0;
9394 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009395
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009396 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009397 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009398
9399 ch = PyUnicode_READ(kind, data, i);
9400 if (!Py_UNICODE_ISUPPER(ch)) {
9401 maxchar = Py_UNICODE_TOUPPER(ch);
9402 PyUnicode_WRITE(kind, data, i, maxchar);
9403 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009404 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009405 ++i;
9406 for(; i < len; ++i) {
9407 ch = PyUnicode_READ(kind, data, i);
9408 if (!Py_UNICODE_ISLOWER(ch)) {
9409 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9410 if (lo > maxchar)
9411 maxchar = lo;
9412 PyUnicode_WRITE(kind, data, i, lo);
9413 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009414 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009415 else if (ch > maxchar)
9416 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009417 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418
9419 if (touched)
9420 return maxchar;
9421 else
9422 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009423}
9424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009426fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009427{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009428 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9429 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9430 const int kind = PyUnicode_KIND(self);
9431 void *data = PyUnicode_DATA(self);
9432 Py_UCS4 maxchar = 0;
9433 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009434 int previous_is_cased;
9435
9436 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009437 if (len == 1) {
9438 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9439 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9440 if (ti != ch) {
9441 PyUnicode_WRITE(kind, data, i, ti);
9442 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009443 }
9444 else
9445 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009446 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009447 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009448 for(; i < len; ++i) {
9449 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9450 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009451
Benjamin Peterson29060642009-01-31 22:14:21 +00009452 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009454 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 nu = Py_UNICODE_TOTITLE(ch);
9456
9457 if (nu > maxchar)
9458 maxchar = nu;
9459 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009460
Benjamin Peterson29060642009-01-31 22:14:21 +00009461 if (Py_UNICODE_ISLOWER(ch) ||
9462 Py_UNICODE_ISUPPER(ch) ||
9463 Py_UNICODE_ISTITLE(ch))
9464 previous_is_cased = 1;
9465 else
9466 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009469}
9470
Tim Peters8ce9f162004-08-27 01:49:32 +00009471PyObject *
9472PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009473{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009475 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009477 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009478 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9479 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009480 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009482 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009484 int use_memcpy;
9485 unsigned char *res_data = NULL, *sep_data = NULL;
9486 PyObject *last_obj;
9487 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488
Tim Peters05eba1f2004-08-27 21:32:02 +00009489 fseq = PySequence_Fast(seq, "");
9490 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009491 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009492 }
9493
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009494 /* NOTE: the following code can't call back into Python code,
9495 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009496 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009497
Tim Peters05eba1f2004-08-27 21:32:02 +00009498 seqlen = PySequence_Fast_GET_SIZE(fseq);
9499 /* If empty sequence, return u"". */
9500 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009501 Py_DECREF(fseq);
9502 Py_INCREF(unicode_empty);
9503 res = unicode_empty;
9504 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009505 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009506
Tim Peters05eba1f2004-08-27 21:32:02 +00009507 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009508 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009509 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009510 if (seqlen == 1) {
9511 if (PyUnicode_CheckExact(items[0])) {
9512 res = items[0];
9513 Py_INCREF(res);
9514 Py_DECREF(fseq);
9515 return res;
9516 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009517 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009518 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009519 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009520 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009521 /* Set up sep and seplen */
9522 if (separator == NULL) {
9523 /* fall back to a blank space separator */
9524 sep = PyUnicode_FromOrdinal(' ');
9525 if (!sep)
9526 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009527 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009528 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009529 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009530 else {
9531 if (!PyUnicode_Check(separator)) {
9532 PyErr_Format(PyExc_TypeError,
9533 "separator: expected str instance,"
9534 " %.80s found",
9535 Py_TYPE(separator)->tp_name);
9536 goto onError;
9537 }
9538 if (PyUnicode_READY(separator))
9539 goto onError;
9540 sep = separator;
9541 seplen = PyUnicode_GET_LENGTH(separator);
9542 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9543 /* inc refcount to keep this code path symmetric with the
9544 above case of a blank separator */
9545 Py_INCREF(sep);
9546 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009547 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009548 }
9549
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009550 /* There are at least two things to join, or else we have a subclass
9551 * of str in the sequence.
9552 * Do a pre-pass to figure out the total amount of space we'll
9553 * need (sz), and see whether all argument are strings.
9554 */
9555 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009556#ifdef Py_DEBUG
9557 use_memcpy = 0;
9558#else
9559 use_memcpy = 1;
9560#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009561 for (i = 0; i < seqlen; i++) {
9562 const Py_ssize_t old_sz = sz;
9563 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009564 if (!PyUnicode_Check(item)) {
9565 PyErr_Format(PyExc_TypeError,
9566 "sequence item %zd: expected str instance,"
9567 " %.80s found",
9568 i, Py_TYPE(item)->tp_name);
9569 goto onError;
9570 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571 if (PyUnicode_READY(item) == -1)
9572 goto onError;
9573 sz += PyUnicode_GET_LENGTH(item);
9574 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009575 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009576 if (i != 0)
9577 sz += seplen;
9578 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9579 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009580 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009581 goto onError;
9582 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009583 if (use_memcpy && last_obj != NULL) {
9584 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9585 use_memcpy = 0;
9586 }
9587 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009588 }
Tim Petersced69f82003-09-16 20:30:58 +00009589
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009591 if (res == NULL)
9592 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009593
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009594 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009595#ifdef Py_DEBUG
9596 use_memcpy = 0;
9597#else
9598 if (use_memcpy) {
9599 res_data = PyUnicode_1BYTE_DATA(res);
9600 kind = PyUnicode_KIND(res);
9601 if (seplen != 0)
9602 sep_data = PyUnicode_1BYTE_DATA(sep);
9603 }
9604#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009606 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009607 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009608 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009609 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009610 if (use_memcpy) {
9611 Py_MEMCPY(res_data,
9612 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009613 kind * seplen);
9614 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009615 }
9616 else {
9617 copy_characters(res, res_offset, sep, 0, seplen);
9618 res_offset += seplen;
9619 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009620 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009621 itemlen = PyUnicode_GET_LENGTH(item);
9622 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009623 if (use_memcpy) {
9624 Py_MEMCPY(res_data,
9625 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009626 kind * itemlen);
9627 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009628 }
9629 else {
9630 copy_characters(res, res_offset, item, 0, itemlen);
9631 res_offset += itemlen;
9632 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009633 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009634 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009635 if (use_memcpy)
9636 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009637 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009638 else
9639 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009640
Tim Peters05eba1f2004-08-27 21:32:02 +00009641 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009642 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009643 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009645
Benjamin Peterson29060642009-01-31 22:14:21 +00009646 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009647 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009649 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650 return NULL;
9651}
9652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009653#define FILL(kind, data, value, start, length) \
9654 do { \
9655 Py_ssize_t i_ = 0; \
9656 assert(kind != PyUnicode_WCHAR_KIND); \
9657 switch ((kind)) { \
9658 case PyUnicode_1BYTE_KIND: { \
9659 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9660 memset(to_, (unsigned char)value, length); \
9661 break; \
9662 } \
9663 case PyUnicode_2BYTE_KIND: { \
9664 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9665 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9666 break; \
9667 } \
9668 default: { \
9669 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9670 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9671 break; \
9672 } \
9673 } \
9674 } while (0)
9675
Victor Stinner9310abb2011-10-05 00:59:23 +02009676static PyObject *
9677pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009678 Py_ssize_t left,
9679 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009680 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009681{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009682 PyObject *u;
9683 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009684 int kind;
9685 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686
9687 if (left < 0)
9688 left = 0;
9689 if (right < 0)
9690 right = 0;
9691
Victor Stinnerc4b49542011-12-11 22:44:26 +01009692 if (left == 0 && right == 0)
9693 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009695 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9696 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009697 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9698 return NULL;
9699 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009700 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9701 if (fill > maxchar)
9702 maxchar = fill;
9703 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009704 if (!u)
9705 return NULL;
9706
9707 kind = PyUnicode_KIND(u);
9708 data = PyUnicode_DATA(u);
9709 if (left)
9710 FILL(kind, data, fill, 0, left);
9711 if (right)
9712 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009713 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009714 assert(_PyUnicode_CheckConsistency(u, 1));
9715 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009717#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009718
Alexander Belopolsky40018472011-02-26 01:02:56 +00009719PyObject *
9720PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009721{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009722 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009723
9724 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009725 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009726 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009728 switch(PyUnicode_KIND(string)) {
9729 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009730 if (PyUnicode_IS_ASCII(string))
9731 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009732 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009733 PyUnicode_GET_LENGTH(string), keepends);
9734 else
9735 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009736 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009737 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009738 break;
9739 case PyUnicode_2BYTE_KIND:
9740 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009741 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009742 PyUnicode_GET_LENGTH(string), keepends);
9743 break;
9744 case PyUnicode_4BYTE_KIND:
9745 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009746 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009747 PyUnicode_GET_LENGTH(string), keepends);
9748 break;
9749 default:
9750 assert(0);
9751 list = 0;
9752 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009753 Py_DECREF(string);
9754 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009755}
9756
Alexander Belopolsky40018472011-02-26 01:02:56 +00009757static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009758split(PyObject *self,
9759 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009760 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009761{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009762 int kind1, kind2, kind;
9763 void *buf1, *buf2;
9764 Py_ssize_t len1, len2;
9765 PyObject* out;
9766
Guido van Rossumd57fd912000-03-10 22:53:23 +00009767 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009768 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770 if (PyUnicode_READY(self) == -1)
9771 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009773 if (substring == NULL)
9774 switch(PyUnicode_KIND(self)) {
9775 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009776 if (PyUnicode_IS_ASCII(self))
9777 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009778 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009779 PyUnicode_GET_LENGTH(self), maxcount
9780 );
9781 else
9782 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009783 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009784 PyUnicode_GET_LENGTH(self), maxcount
9785 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009786 case PyUnicode_2BYTE_KIND:
9787 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009788 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009789 PyUnicode_GET_LENGTH(self), maxcount
9790 );
9791 case PyUnicode_4BYTE_KIND:
9792 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009793 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009794 PyUnicode_GET_LENGTH(self), maxcount
9795 );
9796 default:
9797 assert(0);
9798 return NULL;
9799 }
9800
9801 if (PyUnicode_READY(substring) == -1)
9802 return NULL;
9803
9804 kind1 = PyUnicode_KIND(self);
9805 kind2 = PyUnicode_KIND(substring);
9806 kind = kind1 > kind2 ? kind1 : kind2;
9807 buf1 = PyUnicode_DATA(self);
9808 buf2 = PyUnicode_DATA(substring);
9809 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009810 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009811 if (!buf1)
9812 return NULL;
9813 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009814 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009815 if (!buf2) {
9816 if (kind1 != kind) PyMem_Free(buf1);
9817 return NULL;
9818 }
9819 len1 = PyUnicode_GET_LENGTH(self);
9820 len2 = PyUnicode_GET_LENGTH(substring);
9821
9822 switch(kind) {
9823 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009824 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9825 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009826 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009827 else
9828 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009829 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830 break;
9831 case PyUnicode_2BYTE_KIND:
9832 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009833 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009834 break;
9835 case PyUnicode_4BYTE_KIND:
9836 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009837 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009838 break;
9839 default:
9840 out = NULL;
9841 }
9842 if (kind1 != kind)
9843 PyMem_Free(buf1);
9844 if (kind2 != kind)
9845 PyMem_Free(buf2);
9846 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009847}
9848
Alexander Belopolsky40018472011-02-26 01:02:56 +00009849static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009850rsplit(PyObject *self,
9851 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009852 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009853{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009854 int kind1, kind2, kind;
9855 void *buf1, *buf2;
9856 Py_ssize_t len1, len2;
9857 PyObject* out;
9858
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009859 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009860 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862 if (PyUnicode_READY(self) == -1)
9863 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 if (substring == NULL)
9866 switch(PyUnicode_KIND(self)) {
9867 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009868 if (PyUnicode_IS_ASCII(self))
9869 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009870 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009871 PyUnicode_GET_LENGTH(self), maxcount
9872 );
9873 else
9874 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009875 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009876 PyUnicode_GET_LENGTH(self), maxcount
9877 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009878 case PyUnicode_2BYTE_KIND:
9879 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009880 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009881 PyUnicode_GET_LENGTH(self), maxcount
9882 );
9883 case PyUnicode_4BYTE_KIND:
9884 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009885 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009886 PyUnicode_GET_LENGTH(self), maxcount
9887 );
9888 default:
9889 assert(0);
9890 return NULL;
9891 }
9892
9893 if (PyUnicode_READY(substring) == -1)
9894 return NULL;
9895
9896 kind1 = PyUnicode_KIND(self);
9897 kind2 = PyUnicode_KIND(substring);
9898 kind = kind1 > kind2 ? kind1 : kind2;
9899 buf1 = PyUnicode_DATA(self);
9900 buf2 = PyUnicode_DATA(substring);
9901 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009902 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009903 if (!buf1)
9904 return NULL;
9905 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009906 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009907 if (!buf2) {
9908 if (kind1 != kind) PyMem_Free(buf1);
9909 return NULL;
9910 }
9911 len1 = PyUnicode_GET_LENGTH(self);
9912 len2 = PyUnicode_GET_LENGTH(substring);
9913
9914 switch(kind) {
9915 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009916 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9917 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009918 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009919 else
9920 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009921 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009922 break;
9923 case PyUnicode_2BYTE_KIND:
9924 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009925 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009926 break;
9927 case PyUnicode_4BYTE_KIND:
9928 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009929 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 break;
9931 default:
9932 out = NULL;
9933 }
9934 if (kind1 != kind)
9935 PyMem_Free(buf1);
9936 if (kind2 != kind)
9937 PyMem_Free(buf2);
9938 return out;
9939}
9940
9941static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009942anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9943 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944{
9945 switch(kind) {
9946 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009947 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9948 return asciilib_find(buf1, len1, buf2, len2, offset);
9949 else
9950 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951 case PyUnicode_2BYTE_KIND:
9952 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9953 case PyUnicode_4BYTE_KIND:
9954 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9955 }
9956 assert(0);
9957 return -1;
9958}
9959
9960static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009961anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9962 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009963{
9964 switch(kind) {
9965 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009966 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9967 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9968 else
9969 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009970 case PyUnicode_2BYTE_KIND:
9971 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9972 case PyUnicode_4BYTE_KIND:
9973 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9974 }
9975 assert(0);
9976 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009977}
9978
Alexander Belopolsky40018472011-02-26 01:02:56 +00009979static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980replace(PyObject *self, PyObject *str1,
9981 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009982{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983 PyObject *u;
9984 char *sbuf = PyUnicode_DATA(self);
9985 char *buf1 = PyUnicode_DATA(str1);
9986 char *buf2 = PyUnicode_DATA(str2);
9987 int srelease = 0, release1 = 0, release2 = 0;
9988 int skind = PyUnicode_KIND(self);
9989 int kind1 = PyUnicode_KIND(str1);
9990 int kind2 = PyUnicode_KIND(str2);
9991 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9992 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9993 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009994 int mayshrink;
9995 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009996
9997 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009998 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010000 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010001
Victor Stinner59de0ee2011-10-07 10:01:28 +020010002 if (str1 == str2)
10003 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 if (skind < kind1)
10005 /* substring too wide to be present */
10006 goto nothing;
10007
Victor Stinner49a0a212011-10-12 23:46:10 +020010008 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10009 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10010 /* Replacing str1 with str2 may cause a maxchar reduction in the
10011 result string. */
10012 mayshrink = (maxchar_str2 < maxchar);
10013 maxchar = Py_MAX(maxchar, maxchar_str2);
10014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010016 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010017 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010019 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010021 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010022 Py_UCS4 u1, u2;
10023 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010025 if (findchar(sbuf, PyUnicode_KIND(self),
10026 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010027 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010028 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010030 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010032 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010033 rkind = PyUnicode_KIND(u);
10034 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10035 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010036 if (--maxcount < 0)
10037 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010039 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010040 }
10041 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 int rkind = skind;
10043 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 if (kind1 < rkind) {
10046 /* widen substring */
10047 buf1 = _PyUnicode_AsKind(str1, rkind);
10048 if (!buf1) goto error;
10049 release1 = 1;
10050 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010051 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010052 if (i < 0)
10053 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010054 if (rkind > kind2) {
10055 /* widen replacement */
10056 buf2 = _PyUnicode_AsKind(str2, rkind);
10057 if (!buf2) goto error;
10058 release2 = 1;
10059 }
10060 else if (rkind < kind2) {
10061 /* widen self and buf1 */
10062 rkind = kind2;
10063 if (release1) PyMem_Free(buf1);
10064 sbuf = _PyUnicode_AsKind(self, rkind);
10065 if (!sbuf) goto error;
10066 srelease = 1;
10067 buf1 = _PyUnicode_AsKind(str1, rkind);
10068 if (!buf1) goto error;
10069 release1 = 1;
10070 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010071 u = PyUnicode_New(slen, maxchar);
10072 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010074 assert(PyUnicode_KIND(u) == rkind);
10075 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010076
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010077 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010078 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010079 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010080 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010081 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010082 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010083
10084 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010085 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010086 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010087 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010088 if (i == -1)
10089 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010090 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010092 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010094 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010095 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010096 }
10097 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010098 Py_ssize_t n, i, j, ires;
10099 Py_ssize_t product, new_size;
10100 int rkind = skind;
10101 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010104 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010105 buf1 = _PyUnicode_AsKind(str1, rkind);
10106 if (!buf1) goto error;
10107 release1 = 1;
10108 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010109 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010110 if (n == 0)
10111 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010113 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 buf2 = _PyUnicode_AsKind(str2, rkind);
10115 if (!buf2) goto error;
10116 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010117 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010118 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010119 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 rkind = kind2;
10121 sbuf = _PyUnicode_AsKind(self, rkind);
10122 if (!sbuf) goto error;
10123 srelease = 1;
10124 if (release1) PyMem_Free(buf1);
10125 buf1 = _PyUnicode_AsKind(str1, rkind);
10126 if (!buf1) goto error;
10127 release1 = 1;
10128 }
10129 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10130 PyUnicode_GET_LENGTH(str1))); */
10131 product = n * (len2-len1);
10132 if ((product / (len2-len1)) != n) {
10133 PyErr_SetString(PyExc_OverflowError,
10134 "replace string is too long");
10135 goto error;
10136 }
10137 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010138 if (new_size == 0) {
10139 Py_INCREF(unicode_empty);
10140 u = unicode_empty;
10141 goto done;
10142 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10144 PyErr_SetString(PyExc_OverflowError,
10145 "replace string is too long");
10146 goto error;
10147 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010148 u = PyUnicode_New(new_size, maxchar);
10149 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010151 assert(PyUnicode_KIND(u) == rkind);
10152 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 ires = i = 0;
10154 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010155 while (n-- > 0) {
10156 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010157 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010158 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010159 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010160 if (j == -1)
10161 break;
10162 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010163 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010164 memcpy(res + rkind * ires,
10165 sbuf + rkind * i,
10166 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010168 }
10169 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010171 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010173 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010175 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010177 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010179 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010180 memcpy(res + rkind * ires,
10181 sbuf + rkind * i,
10182 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010183 }
10184 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010185 /* interleave */
10186 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010187 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010189 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010191 if (--n <= 0)
10192 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010193 memcpy(res + rkind * ires,
10194 sbuf + rkind * i,
10195 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 ires++;
10197 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010198 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010199 memcpy(res + rkind * ires,
10200 sbuf + rkind * i,
10201 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010202 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010203 }
10204
10205 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010206 unicode_adjust_maxchar(&u);
10207 if (u == NULL)
10208 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010209 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010210
10211 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 if (srelease)
10213 PyMem_FREE(sbuf);
10214 if (release1)
10215 PyMem_FREE(buf1);
10216 if (release2)
10217 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010218 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010220
Benjamin Peterson29060642009-01-31 22:14:21 +000010221 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010222 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 if (srelease)
10224 PyMem_FREE(sbuf);
10225 if (release1)
10226 PyMem_FREE(buf1);
10227 if (release2)
10228 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010229 return unicode_result_unchanged(self);
10230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 error:
10232 if (srelease && sbuf)
10233 PyMem_FREE(sbuf);
10234 if (release1 && buf1)
10235 PyMem_FREE(buf1);
10236 if (release2 && buf2)
10237 PyMem_FREE(buf2);
10238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010239}
10240
10241/* --- Unicode Object Methods --------------------------------------------- */
10242
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010243PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010244 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245\n\
10246Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010247characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010248
10249static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010250unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010251{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010252 return fixup(self, fixtitle);
10253}
10254
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010255PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010256 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010257\n\
10258Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010259have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010260
10261static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010262unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010263{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264 return fixup(self, fixcapitalize);
10265}
10266
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits self with split(self, NULL, -1), runs fixup(..., fixcapitalize)
   on every list item, and joins the list again with
   PyUnicode_Join(NULL, list).  Returns NULL as soon as a step fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new item in for the old one */
    while (pos < PyList_GET_SIZE(words)) {
        PyObject *capitalized = fixup(PyList_GET_ITEM(words, pos),
                                      fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
        pos++;
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10304
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010305/* Argument converter. Coerces to a single unicode character */
10306
10307static int
10308convert_uc(PyObject *obj, void *addr)
10309{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010311 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010312
Benjamin Peterson14339b62009-01-31 16:36:08 +000010313 uniobj = PyUnicode_FromObject(obj);
10314 if (uniobj == NULL) {
10315 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010316 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010317 return 0;
10318 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010320 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010321 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010322 Py_DECREF(uniobj);
10323 return 0;
10324 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010326 Py_DECREF(uniobj);
10327 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010328}
10329
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010330PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010331 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010332\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010333Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010334done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010335
10336static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010337unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010338{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010339 Py_ssize_t marg, left;
10340 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 Py_UCS4 fillchar = ' ';
10342
Victor Stinnere9a29352011-10-01 02:14:59 +020010343 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010345
Victor Stinnerc4b49542011-12-11 22:44:26 +010010346 if (PyUnicode_READY(self) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010347 return NULL;
10348
Victor Stinnerc4b49542011-12-11 22:44:26 +010010349 if (PyUnicode_GET_LENGTH(self) >= width)
10350 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351
Victor Stinnerc4b49542011-12-11 22:44:26 +010010352 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010353 left = marg / 2 + (marg & width & 1);
10354
Victor Stinner9310abb2011-10-05 00:59:23 +020010355 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010356}
10357
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358/* This function assumes that str1 and str2 are readied by the caller. */
10359
Marc-André Lemburge5034372000-08-08 08:04:29 +000010360static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010361unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010362{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 int kind1, kind2;
10364 void *data1, *data2;
10365 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 kind1 = PyUnicode_KIND(str1);
10368 kind2 = PyUnicode_KIND(str2);
10369 data1 = PyUnicode_DATA(str1);
10370 data2 = PyUnicode_DATA(str2);
10371 len1 = PyUnicode_GET_LENGTH(str1);
10372 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 for (i = 0; i < len1 && i < len2; ++i) {
10375 Py_UCS4 c1, c2;
10376 c1 = PyUnicode_READ(kind1, data1, i);
10377 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010378
10379 if (c1 != c2)
10380 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010381 }
10382
10383 return (len1 < len2) ? -1 : (len1 != len2);
10384}
10385
Alexander Belopolsky40018472011-02-26 01:02:56 +000010386int
10387PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10390 if (PyUnicode_READY(left) == -1 ||
10391 PyUnicode_READY(right) == -1)
10392 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010393 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010395 PyErr_Format(PyExc_TypeError,
10396 "Can't compare %.100s and %.100s",
10397 left->ob_type->tp_name,
10398 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399 return -1;
10400}
10401
Martin v. Löwis5b222132007-06-10 09:51:05 +000010402int
10403PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10404{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 Py_ssize_t i;
10406 int kind;
10407 void *data;
10408 Py_UCS4 chr;
10409
Victor Stinner910337b2011-10-03 03:20:16 +020010410 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010411 if (PyUnicode_READY(uni) == -1)
10412 return -1;
10413 kind = PyUnicode_KIND(uni);
10414 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010415 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10417 if (chr != str[i])
10418 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010419 /* This check keeps Python strings that end in '\0' from comparing equal
10420 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010422 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010423 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010424 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010425 return 0;
10426}
10427
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010428
Benjamin Peterson29060642009-01-31 22:14:21 +000010429#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010430 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010431
Alexander Belopolsky40018472011-02-26 01:02:56 +000010432PyObject *
10433PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010434{
10435 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010436
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010437 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10438 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 if (PyUnicode_READY(left) == -1 ||
10440 PyUnicode_READY(right) == -1)
10441 return NULL;
10442 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10443 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010444 if (op == Py_EQ) {
10445 Py_INCREF(Py_False);
10446 return Py_False;
10447 }
10448 if (op == Py_NE) {
10449 Py_INCREF(Py_True);
10450 return Py_True;
10451 }
10452 }
10453 if (left == right)
10454 result = 0;
10455 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010456 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010457
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010458 /* Convert the return value to a Boolean */
10459 switch (op) {
10460 case Py_EQ:
10461 v = TEST_COND(result == 0);
10462 break;
10463 case Py_NE:
10464 v = TEST_COND(result != 0);
10465 break;
10466 case Py_LE:
10467 v = TEST_COND(result <= 0);
10468 break;
10469 case Py_GE:
10470 v = TEST_COND(result >= 0);
10471 break;
10472 case Py_LT:
10473 v = TEST_COND(result == -1);
10474 break;
10475 case Py_GT:
10476 v = TEST_COND(result == 1);
10477 break;
10478 default:
10479 PyErr_BadArgument();
10480 return NULL;
10481 }
10482 Py_INCREF(v);
10483 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010484 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010485
Brian Curtindfc80e32011-08-10 20:28:54 -050010486 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010487}
10488
Alexander Belopolsky40018472011-02-26 01:02:56 +000010489int
10490PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010491{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010492 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 int kind1, kind2, kind;
10494 void *buf1, *buf2;
10495 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010496 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010497
10498 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010499 sub = PyUnicode_FromObject(element);
10500 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010501 PyErr_Format(PyExc_TypeError,
10502 "'in <string>' requires string as left operand, not %s",
10503 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010504 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010505 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 if (PyUnicode_READY(sub) == -1)
10507 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010508
Thomas Wouters477c8d52006-05-27 19:21:47 +000010509 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010510 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010511 Py_DECREF(sub);
10512 return -1;
10513 }
10514
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 kind1 = PyUnicode_KIND(str);
10516 kind2 = PyUnicode_KIND(sub);
10517 kind = kind1 > kind2 ? kind1 : kind2;
10518 buf1 = PyUnicode_DATA(str);
10519 buf2 = PyUnicode_DATA(sub);
10520 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010521 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 if (!buf1) {
10523 Py_DECREF(sub);
10524 return -1;
10525 }
10526 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010527 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 if (!buf2) {
10529 Py_DECREF(sub);
10530 if (kind1 != kind) PyMem_Free(buf1);
10531 return -1;
10532 }
10533 len1 = PyUnicode_GET_LENGTH(str);
10534 len2 = PyUnicode_GET_LENGTH(sub);
10535
10536 switch(kind) {
10537 case PyUnicode_1BYTE_KIND:
10538 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10539 break;
10540 case PyUnicode_2BYTE_KIND:
10541 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10542 break;
10543 case PyUnicode_4BYTE_KIND:
10544 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10545 break;
10546 default:
10547 result = -1;
10548 assert(0);
10549 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010550
10551 Py_DECREF(str);
10552 Py_DECREF(sub);
10553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 if (kind1 != kind)
10555 PyMem_Free(buf1);
10556 if (kind2 != kind)
10557 PyMem_Free(buf2);
10558
Guido van Rossum403d68b2000-03-13 15:55:09 +000010559 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010560}
10561
Guido van Rossumd57fd912000-03-10 22:53:23 +000010562/* Concat to string or Unicode object giving a new Unicode object. */
10563
Alexander Belopolsky40018472011-02-26 01:02:56 +000010564PyObject *
10565PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010566{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010568 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010569
10570 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010572 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010573 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010575 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010576 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577
10578 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010579 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010580 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010582 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010583 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010584 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586 }
10587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010589 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10590 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591
Guido van Rossumd57fd912000-03-10 22:53:23 +000010592 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 w = PyUnicode_New(
10594 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10595 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010597 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010598 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10599 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600 Py_DECREF(u);
10601 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010602 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604
Benjamin Peterson29060642009-01-31 22:14:21 +000010605 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606 Py_XDECREF(u);
10607 Py_XDECREF(v);
10608 return NULL;
10609}
10610
Victor Stinnerb0923652011-10-04 01:17:31 +020010611static void
10612unicode_append_inplace(PyObject **p_left, PyObject *right)
10613{
10614 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010615
10616 assert(PyUnicode_IS_READY(*p_left));
10617 assert(PyUnicode_IS_READY(right));
10618
10619 left_len = PyUnicode_GET_LENGTH(*p_left);
10620 right_len = PyUnicode_GET_LENGTH(right);
10621 if (left_len > PY_SSIZE_T_MAX - right_len) {
10622 PyErr_SetString(PyExc_OverflowError,
10623 "strings are too large to concat");
10624 goto error;
10625 }
10626 new_len = left_len + right_len;
10627
10628 /* Now we own the last reference to 'left', so we can resize it
10629 * in-place.
10630 */
10631 if (unicode_resize(p_left, new_len) != 0) {
10632 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10633 * deallocated so it cannot be put back into
10634 * 'variable'. The MemoryError is raised when there
10635 * is no value in 'variable', which might (very
10636 * remotely) be a cause of incompatibilities.
10637 */
10638 goto error;
10639 }
10640 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010641 copy_characters(*p_left, left_len, right, 0, right_len);
10642 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010643 return;
10644
10645error:
10646 Py_DECREF(*p_left);
10647 *p_left = NULL;
10648}
10649
Walter Dörwald1ab83302007-05-18 17:15:44 +000010650void
Victor Stinner23e56682011-10-03 03:54:37 +020010651PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010652{
Victor Stinner23e56682011-10-03 03:54:37 +020010653 PyObject *left, *res;
10654
10655 if (p_left == NULL) {
10656 if (!PyErr_Occurred())
10657 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010658 return;
10659 }
Victor Stinner23e56682011-10-03 03:54:37 +020010660 left = *p_left;
10661 if (right == NULL || !PyUnicode_Check(left)) {
10662 if (!PyErr_Occurred())
10663 PyErr_BadInternalCall();
10664 goto error;
10665 }
10666
Victor Stinnere1335c72011-10-04 20:53:03 +020010667 if (PyUnicode_READY(left))
10668 goto error;
10669 if (PyUnicode_READY(right))
10670 goto error;
10671
Victor Stinner23e56682011-10-03 03:54:37 +020010672 if (PyUnicode_CheckExact(left) && left != unicode_empty
10673 && PyUnicode_CheckExact(right) && right != unicode_empty
10674 && unicode_resizable(left)
10675 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10676 || _PyUnicode_WSTR(left) != NULL))
10677 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010678 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10679 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010680 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010681 not so different than duplicating the string. */
10682 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010683 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010684 unicode_append_inplace(p_left, right);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010685 assert(p_left == NULL || _PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010686 return;
10687 }
10688 }
10689
10690 res = PyUnicode_Concat(left, right);
10691 if (res == NULL)
10692 goto error;
10693 Py_DECREF(left);
10694 *p_left = res;
10695 return;
10696
10697error:
10698 Py_DECREF(*p_left);
10699 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010700}
10701
10702void
10703PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10704{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010705 PyUnicode_Append(pleft, right);
10706 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010707}
10708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010709PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010710 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010712Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010713string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010714interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715
10716static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010717unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010718{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010719 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010720 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010721 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 int kind1, kind2, kind;
10724 void *buf1, *buf2;
10725 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726
Jesus Ceaac451502011-04-20 17:09:23 +020010727 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10728 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010729 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010730
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010731 kind1 = PyUnicode_KIND(self);
10732 kind2 = PyUnicode_KIND(substring);
10733 kind = kind1 > kind2 ? kind1 : kind2;
10734 buf1 = PyUnicode_DATA(self);
10735 buf2 = PyUnicode_DATA(substring);
10736 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010737 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 if (!buf1) {
10739 Py_DECREF(substring);
10740 return NULL;
10741 }
10742 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010743 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 if (!buf2) {
10745 Py_DECREF(substring);
10746 if (kind1 != kind) PyMem_Free(buf1);
10747 return NULL;
10748 }
10749 len1 = PyUnicode_GET_LENGTH(self);
10750 len2 = PyUnicode_GET_LENGTH(substring);
10751
10752 ADJUST_INDICES(start, end, len1);
10753 switch(kind) {
10754 case PyUnicode_1BYTE_KIND:
10755 iresult = ucs1lib_count(
10756 ((Py_UCS1*)buf1) + start, end - start,
10757 buf2, len2, PY_SSIZE_T_MAX
10758 );
10759 break;
10760 case PyUnicode_2BYTE_KIND:
10761 iresult = ucs2lib_count(
10762 ((Py_UCS2*)buf1) + start, end - start,
10763 buf2, len2, PY_SSIZE_T_MAX
10764 );
10765 break;
10766 case PyUnicode_4BYTE_KIND:
10767 iresult = ucs4lib_count(
10768 ((Py_UCS4*)buf1) + start, end - start,
10769 buf2, len2, PY_SSIZE_T_MAX
10770 );
10771 break;
10772 default:
10773 assert(0); iresult = 0;
10774 }
10775
10776 result = PyLong_FromSsize_t(iresult);
10777
10778 if (kind1 != kind)
10779 PyMem_Free(buf1);
10780 if (kind2 != kind)
10781 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010782
10783 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010784
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785 return result;
10786}
10787
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010788PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010789 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010790\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010791Encode S using the codec registered for encoding. Default encoding\n\
10792is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010793handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010794a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10795'xmlcharrefreplace' as well as any other name registered with\n\
10796codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010797
10798static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010799unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010800{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010801 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010802 char *encoding = NULL;
10803 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010804
Benjamin Peterson308d6372009-09-18 21:42:35 +000010805 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10806 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010807 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010808 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010809}
10810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010811PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010812 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813\n\
10814Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010815If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816
10817static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010818unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010820 Py_ssize_t i, j, line_pos, src_len, incr;
10821 Py_UCS4 ch;
10822 PyObject *u;
10823 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010825 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010826 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827
10828 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010829 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830
Antoine Pitrou22425222011-10-04 19:10:51 +020010831 if (PyUnicode_READY(self) == -1)
10832 return NULL;
10833
Thomas Wouters7e474022000-07-16 12:04:32 +000010834 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010835 src_len = PyUnicode_GET_LENGTH(self);
10836 i = j = line_pos = 0;
10837 kind = PyUnicode_KIND(self);
10838 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010839 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010840 for (; i < src_len; i++) {
10841 ch = PyUnicode_READ(kind, src_data, i);
10842 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010843 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010844 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010845 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010846 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010847 goto overflow;
10848 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010849 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010850 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010851 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010853 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010854 goto overflow;
10855 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010856 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010857 if (ch == '\n' || ch == '\r')
10858 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010859 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010860 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010861 if (!found)
10862 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010863
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010865 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010866 if (!u)
10867 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010868 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010869
Antoine Pitroue71d5742011-10-04 15:55:09 +020010870 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871
Antoine Pitroue71d5742011-10-04 15:55:09 +020010872 for (; i < src_len; i++) {
10873 ch = PyUnicode_READ(kind, src_data, i);
10874 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010875 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010876 incr = tabsize - (line_pos % tabsize);
10877 line_pos += incr;
10878 while (incr--) {
10879 PyUnicode_WRITE(kind, dest_data, j, ' ');
10880 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010881 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010882 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010883 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010884 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010885 line_pos++;
10886 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010887 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010888 if (ch == '\n' || ch == '\r')
10889 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010891 }
10892 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010893 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010894
Antoine Pitroue71d5742011-10-04 15:55:09 +020010895 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010896 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10897 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010898}
10899
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010900PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010901 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010902\n\
10903Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010904such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010905arguments start and end are interpreted as in slice notation.\n\
10906\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010907Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010908
10909static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010910unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010912 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010913 Py_ssize_t start;
10914 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010915 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010916
Jesus Ceaac451502011-04-20 17:09:23 +020010917 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10918 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010919 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010920
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010921 if (PyUnicode_READY(self) == -1)
10922 return NULL;
10923 if (PyUnicode_READY(substring) == -1)
10924 return NULL;
10925
Victor Stinner7931d9a2011-11-04 00:22:48 +010010926 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927
10928 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930 if (result == -2)
10931 return NULL;
10932
Christian Heimes217cfd12007-12-02 14:31:20 +000010933 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010934}
10935
10936static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010937unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010938{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010939 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10940 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010942 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010943}
10944
Guido van Rossumc2504932007-09-18 19:42:40 +000010945/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010946 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010947static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010948unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949{
Guido van Rossumc2504932007-09-18 19:42:40 +000010950 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010951 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010953 if (_PyUnicode_HASH(self) != -1)
10954 return _PyUnicode_HASH(self);
10955 if (PyUnicode_READY(self) == -1)
10956 return -1;
10957 len = PyUnicode_GET_LENGTH(self);
10958
10959 /* The hash function as a macro, gets expanded three times below. */
10960#define HASH(P) \
10961 x = (Py_uhash_t)*P << 7; \
10962 while (--len >= 0) \
10963 x = (1000003*x) ^ (Py_uhash_t)*P++;
10964
10965 switch (PyUnicode_KIND(self)) {
10966 case PyUnicode_1BYTE_KIND: {
10967 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10968 HASH(c);
10969 break;
10970 }
10971 case PyUnicode_2BYTE_KIND: {
10972 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10973 HASH(s);
10974 break;
10975 }
10976 default: {
10977 Py_UCS4 *l;
10978 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10979 "Impossible switch case in unicode_hash");
10980 l = PyUnicode_4BYTE_DATA(self);
10981 HASH(l);
10982 break;
10983 }
10984 }
10985 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10986
Guido van Rossumc2504932007-09-18 19:42:40 +000010987 if (x == -1)
10988 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010990 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010994PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010995 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010997Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998
10999static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011000unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011002 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011003 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011004 Py_ssize_t start;
11005 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006
Jesus Ceaac451502011-04-20 17:09:23 +020011007 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11008 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011009 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011011 if (PyUnicode_READY(self) == -1)
11012 return NULL;
11013 if (PyUnicode_READY(substring) == -1)
11014 return NULL;
11015
Victor Stinner7931d9a2011-11-04 00:22:48 +010011016 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017
11018 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011020 if (result == -2)
11021 return NULL;
11022
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023 if (result < 0) {
11024 PyErr_SetString(PyExc_ValueError, "substring not found");
11025 return NULL;
11026 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011027
Christian Heimes217cfd12007-12-02 14:31:20 +000011028 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029}
11030
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011031PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011032 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011034Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011035at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036
11037static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011038unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011040 Py_ssize_t i, length;
11041 int kind;
11042 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043 int cased;
11044
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011045 if (PyUnicode_READY(self) == -1)
11046 return NULL;
11047 length = PyUnicode_GET_LENGTH(self);
11048 kind = PyUnicode_KIND(self);
11049 data = PyUnicode_DATA(self);
11050
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011052 if (length == 1)
11053 return PyBool_FromLong(
11054 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011056 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011057 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011058 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011059
Guido van Rossumd57fd912000-03-10 22:53:23 +000011060 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011061 for (i = 0; i < length; i++) {
11062 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011063
Benjamin Peterson29060642009-01-31 22:14:21 +000011064 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11065 return PyBool_FromLong(0);
11066 else if (!cased && Py_UNICODE_ISLOWER(ch))
11067 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011068 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011069 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070}
11071
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011072PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011073 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011074\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011075Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011076at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077
11078static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011079unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011081 Py_ssize_t i, length;
11082 int kind;
11083 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084 int cased;
11085
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011086 if (PyUnicode_READY(self) == -1)
11087 return NULL;
11088 length = PyUnicode_GET_LENGTH(self);
11089 kind = PyUnicode_KIND(self);
11090 data = PyUnicode_DATA(self);
11091
Guido van Rossumd57fd912000-03-10 22:53:23 +000011092 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093 if (length == 1)
11094 return PyBool_FromLong(
11095 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011096
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011097 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011098 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011099 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011100
Guido van Rossumd57fd912000-03-10 22:53:23 +000011101 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011102 for (i = 0; i < length; i++) {
11103 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011104
Benjamin Peterson29060642009-01-31 22:14:21 +000011105 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11106 return PyBool_FromLong(0);
11107 else if (!cased && Py_UNICODE_ISUPPER(ch))
11108 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011109 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011110 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011111}
11112
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011113PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011114 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011115\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011116Return True if S is a titlecased string and there is at least one\n\
11117character in S, i.e. upper- and titlecase characters may only\n\
11118follow uncased characters and lowercase characters only cased ones.\n\
11119Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011120
11121static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011122unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011123{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011124 Py_ssize_t i, length;
11125 int kind;
11126 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011127 int cased, previous_is_cased;
11128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011129 if (PyUnicode_READY(self) == -1)
11130 return NULL;
11131 length = PyUnicode_GET_LENGTH(self);
11132 kind = PyUnicode_KIND(self);
11133 data = PyUnicode_DATA(self);
11134
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011136 if (length == 1) {
11137 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11138 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11139 (Py_UNICODE_ISUPPER(ch) != 0));
11140 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011142 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011143 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011144 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011145
Guido van Rossumd57fd912000-03-10 22:53:23 +000011146 cased = 0;
11147 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 for (i = 0; i < length; i++) {
11149 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011150
Benjamin Peterson29060642009-01-31 22:14:21 +000011151 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11152 if (previous_is_cased)
11153 return PyBool_FromLong(0);
11154 previous_is_cased = 1;
11155 cased = 1;
11156 }
11157 else if (Py_UNICODE_ISLOWER(ch)) {
11158 if (!previous_is_cased)
11159 return PyBool_FromLong(0);
11160 previous_is_cased = 1;
11161 cased = 1;
11162 }
11163 else
11164 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011165 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011166 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167}
11168
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011169PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011170 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011172Return True if all characters in S are whitespace\n\
11173and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011174
11175static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011176unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011177{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011178 Py_ssize_t i, length;
11179 int kind;
11180 void *data;
11181
11182 if (PyUnicode_READY(self) == -1)
11183 return NULL;
11184 length = PyUnicode_GET_LENGTH(self);
11185 kind = PyUnicode_KIND(self);
11186 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011187
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011189 if (length == 1)
11190 return PyBool_FromLong(
11191 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011193 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011194 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011195 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011196
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011197 for (i = 0; i < length; i++) {
11198 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011199 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011200 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011201 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011202 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203}
11204
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011205PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011206 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011207\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011208Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011209and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011210
11211static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011212unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011213{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011214 Py_ssize_t i, length;
11215 int kind;
11216 void *data;
11217
11218 if (PyUnicode_READY(self) == -1)
11219 return NULL;
11220 length = PyUnicode_GET_LENGTH(self);
11221 kind = PyUnicode_KIND(self);
11222 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011223
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011224 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011225 if (length == 1)
11226 return PyBool_FromLong(
11227 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011228
11229 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011231 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011232
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011233 for (i = 0; i < length; i++) {
11234 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011235 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011236 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011237 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011238}
11239
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011240PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011241 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011242\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011243Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011244and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011245
11246static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011247unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011248{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011249 int kind;
11250 void *data;
11251 Py_ssize_t len, i;
11252
11253 if (PyUnicode_READY(self) == -1)
11254 return NULL;
11255
11256 kind = PyUnicode_KIND(self);
11257 data = PyUnicode_DATA(self);
11258 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011259
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011260 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011261 if (len == 1) {
11262 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11263 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11264 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011265
11266 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011267 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011268 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011270 for (i = 0; i < len; i++) {
11271 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011272 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011273 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011274 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011275 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011276}
11277
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011278PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011279 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011281Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011282False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283
11284static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011285unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011287 Py_ssize_t i, length;
11288 int kind;
11289 void *data;
11290
11291 if (PyUnicode_READY(self) == -1)
11292 return NULL;
11293 length = PyUnicode_GET_LENGTH(self);
11294 kind = PyUnicode_KIND(self);
11295 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011296
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011298 if (length == 1)
11299 return PyBool_FromLong(
11300 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011302 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011303 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011304 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 for (i = 0; i < length; i++) {
11307 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011308 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011310 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311}
11312
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011313PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011314 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011316Return True if all characters in S are digits\n\
11317and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318
11319static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011320unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322 Py_ssize_t i, length;
11323 int kind;
11324 void *data;
11325
11326 if (PyUnicode_READY(self) == -1)
11327 return NULL;
11328 length = PyUnicode_GET_LENGTH(self);
11329 kind = PyUnicode_KIND(self);
11330 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 if (length == 1) {
11334 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11335 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11336 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011337
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011338 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011340 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 for (i = 0; i < length; i++) {
11343 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011344 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011346 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347}
11348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011349PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011350 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011352Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011353False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354
11355static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011356unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358 Py_ssize_t i, length;
11359 int kind;
11360 void *data;
11361
11362 if (PyUnicode_READY(self) == -1)
11363 return NULL;
11364 length = PyUnicode_GET_LENGTH(self);
11365 kind = PyUnicode_KIND(self);
11366 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 if (length == 1)
11370 return PyBool_FromLong(
11371 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011373 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011375 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011377 for (i = 0; i < length; i++) {
11378 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011379 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011381 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382}
11383
Martin v. Löwis47383402007-08-15 07:32:56 +000011384int
11385PyUnicode_IsIdentifier(PyObject *self)
11386{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011387 int kind;
11388 void *data;
11389 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011390 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011392 if (PyUnicode_READY(self) == -1) {
11393 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011394 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011395 }
11396
11397 /* Special case for empty strings */
11398 if (PyUnicode_GET_LENGTH(self) == 0)
11399 return 0;
11400 kind = PyUnicode_KIND(self);
11401 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011402
11403 /* PEP 3131 says that the first character must be in
11404 XID_Start and subsequent characters in XID_Continue,
11405 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011406 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011407 letters, digits, underscore). However, given the current
11408 definition of XID_Start and XID_Continue, it is sufficient
11409 to check just for these, except that _ must be allowed
11410 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011412 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011413 return 0;
11414
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011415 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011416 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011417 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011418 return 1;
11419}
11420
11421PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011422 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011423\n\
11424Return True if S is a valid identifier according\n\
11425to the language definition.");
11426
11427static PyObject*
11428unicode_isidentifier(PyObject *self)
11429{
11430 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11431}
11432
Georg Brandl559e5d72008-06-11 18:37:52 +000011433PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011434 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011435\n\
11436Return True if all characters in S are considered\n\
11437printable in repr() or S is empty, False otherwise.");
11438
11439static PyObject*
11440unicode_isprintable(PyObject *self)
11441{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011442 Py_ssize_t i, length;
11443 int kind;
11444 void *data;
11445
11446 if (PyUnicode_READY(self) == -1)
11447 return NULL;
11448 length = PyUnicode_GET_LENGTH(self);
11449 kind = PyUnicode_KIND(self);
11450 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011451
11452 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 if (length == 1)
11454 return PyBool_FromLong(
11455 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011457 for (i = 0; i < length; i++) {
11458 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011459 Py_RETURN_FALSE;
11460 }
11461 }
11462 Py_RETURN_TRUE;
11463}
11464
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011465PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011466 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467\n\
11468Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011469iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470
11471static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011472unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011473{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011474 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475}
11476
Martin v. Löwis18e16552006-02-15 17:27:45 +000011477static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011478unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011480 if (PyUnicode_READY(self) == -1)
11481 return -1;
11482 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483}
11484
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011485PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011486 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011488Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011489done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490
11491static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011492unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011494 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011495 Py_UCS4 fillchar = ' ';
11496
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011497 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498 return NULL;
11499
Victor Stinnerc4b49542011-12-11 22:44:26 +010011500 if (PyUnicode_READY(self) < 0)
11501 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502
Victor Stinnerc4b49542011-12-11 22:44:26 +010011503 if (PyUnicode_GET_LENGTH(self) >= width)
11504 return unicode_result_unchanged(self);
11505
11506 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507}
11508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011509PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011510 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011512Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513
11514static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011515unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517 return fixup(self, fixlower);
11518}
11519
/* Strip modes.  The values double as indexes into stripformat[] below,
   so the two must be kept in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11528
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011529/* externally visible for str.strip(unicode) */
11530PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011531_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011532{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 void *data;
11534 int kind;
11535 Py_ssize_t i, j, len;
11536 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11539 return NULL;
11540
11541 kind = PyUnicode_KIND(self);
11542 data = PyUnicode_DATA(self);
11543 len = PyUnicode_GET_LENGTH(self);
11544 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11545 PyUnicode_DATA(sepobj),
11546 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011547
Benjamin Peterson14339b62009-01-31 16:36:08 +000011548 i = 0;
11549 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011550 while (i < len &&
11551 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011552 i++;
11553 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011554 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011555
Benjamin Peterson14339b62009-01-31 16:36:08 +000011556 j = len;
11557 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011558 do {
11559 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560 } while (j >= i &&
11561 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011562 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011563 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011564
Victor Stinner7931d9a2011-11-04 00:22:48 +010011565 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011566}
11567
11568PyObject*
11569PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11570{
11571 unsigned char *data;
11572 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011573 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011574
Victor Stinnerde636f32011-10-01 03:55:54 +020011575 if (PyUnicode_READY(self) == -1)
11576 return NULL;
11577
11578 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11579
Victor Stinner12bab6d2011-10-01 01:53:49 +020011580 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010011581 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011582
Victor Stinner12bab6d2011-10-01 01:53:49 +020011583 length = end - start;
11584 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011585 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011586
Victor Stinnerde636f32011-10-01 03:55:54 +020011587 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011588 PyErr_SetString(PyExc_IndexError, "string index out of range");
11589 return NULL;
11590 }
11591
Victor Stinnerb9275c12011-10-05 14:01:42 +020011592 if (PyUnicode_IS_ASCII(self)) {
11593 kind = PyUnicode_KIND(self);
11594 data = PyUnicode_1BYTE_DATA(self);
11595 return unicode_fromascii(data + start, length);
11596 }
11597 else {
11598 kind = PyUnicode_KIND(self);
11599 data = PyUnicode_1BYTE_DATA(self);
11600 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011601 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011602 length);
11603 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011604}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605
11606static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011607do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609 int kind;
11610 void *data;
11611 Py_ssize_t len, i, j;
11612
11613 if (PyUnicode_READY(self) == -1)
11614 return NULL;
11615
11616 kind = PyUnicode_KIND(self);
11617 data = PyUnicode_DATA(self);
11618 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011619
Benjamin Peterson14339b62009-01-31 16:36:08 +000011620 i = 0;
11621 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011622 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011623 i++;
11624 }
11625 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011626
Benjamin Peterson14339b62009-01-31 16:36:08 +000011627 j = len;
11628 if (striptype != LEFTSTRIP) {
11629 do {
11630 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011632 j++;
11633 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011634
Victor Stinner7931d9a2011-11-04 00:22:48 +010011635 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636}
11637
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011638
11639static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011640do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011641{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011642 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011643
Benjamin Peterson14339b62009-01-31 16:36:08 +000011644 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11645 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011646
Benjamin Peterson14339b62009-01-31 16:36:08 +000011647 if (sep != NULL && sep != Py_None) {
11648 if (PyUnicode_Check(sep))
11649 return _PyUnicode_XStrip(self, striptype, sep);
11650 else {
11651 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011652 "%s arg must be None or str",
11653 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011654 return NULL;
11655 }
11656 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011657
Benjamin Peterson14339b62009-01-31 16:36:08 +000011658 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011659}
11660
11661
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011662PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011663 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011664\n\
11665Return a copy of the string S with leading and trailing\n\
11666whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011667If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011668
11669static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011670unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011671{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011672 if (PyTuple_GET_SIZE(args) == 0)
11673 return do_strip(self, BOTHSTRIP); /* Common case */
11674 else
11675 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011676}
11677
11678
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011679PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011680 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011681\n\
11682Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011683If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011684
11685static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011686unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011687{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011688 if (PyTuple_GET_SIZE(args) == 0)
11689 return do_strip(self, LEFTSTRIP); /* Common case */
11690 else
11691 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011692}
11693
11694
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011695PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011696 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011697\n\
11698Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011699If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011700
11701static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011702unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011703{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011704 if (PyTuple_GET_SIZE(args) == 0)
11705 return do_strip(self, RIGHTSTRIP); /* Common case */
11706 else
11707 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011708}
11709
11710
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011712unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011714 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011715 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716
Georg Brandl222de0f2009-04-12 12:01:50 +000011717 if (len < 1) {
11718 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011719 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011720 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721
Victor Stinnerc4b49542011-12-11 22:44:26 +010011722 /* no repeat, return original string */
11723 if (len == 1)
11724 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011725
Victor Stinnerc4b49542011-12-11 22:44:26 +010011726 if (PyUnicode_READY(str) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011727 return NULL;
11728
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011729 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011730 PyErr_SetString(PyExc_OverflowError,
11731 "repeated string is too long");
11732 return NULL;
11733 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011735
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011736 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737 if (!u)
11738 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011739 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011741 if (PyUnicode_GET_LENGTH(str) == 1) {
11742 const int kind = PyUnicode_KIND(str);
11743 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11744 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011745 if (kind == PyUnicode_1BYTE_KIND)
11746 memset(to, (unsigned char)fill_char, len);
11747 else {
11748 for (n = 0; n < len; ++n)
11749 PyUnicode_WRITE(kind, to, n, fill_char);
11750 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751 }
11752 else {
11753 /* number of characters copied this far */
11754 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011755 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 char *to = (char *) PyUnicode_DATA(u);
11757 Py_MEMCPY(to, PyUnicode_DATA(str),
11758 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011759 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760 n = (done <= nchars-done) ? done : nchars-done;
11761 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011762 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011763 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764 }
11765
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011766 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011767 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768}
11769
Alexander Belopolsky40018472011-02-26 01:02:56 +000011770PyObject *
11771PyUnicode_Replace(PyObject *obj,
11772 PyObject *subobj,
11773 PyObject *replobj,
11774 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775{
11776 PyObject *self;
11777 PyObject *str1;
11778 PyObject *str2;
11779 PyObject *result;
11780
11781 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011782 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011783 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011785 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011786 Py_DECREF(self);
11787 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788 }
11789 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011790 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011791 Py_DECREF(self);
11792 Py_DECREF(str1);
11793 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011795 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796 Py_DECREF(self);
11797 Py_DECREF(str1);
11798 Py_DECREF(str2);
11799 return result;
11800}
11801
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011802PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011803 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804\n\
11805Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011806old replaced by new. If the optional argument count is\n\
11807given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808
11809static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011812 PyObject *str1;
11813 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011814 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815 PyObject *result;
11816
Martin v. Löwis18e16552006-02-15 17:27:45 +000011817 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011820 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 str1 = PyUnicode_FromObject(str1);
11822 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11823 return NULL;
11824 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011825 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011826 Py_DECREF(str1);
11827 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011828 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829
11830 result = replace(self, str1, str2, maxcount);
11831
11832 Py_DECREF(str1);
11833 Py_DECREF(str2);
11834 return result;
11835}
11836
Alexander Belopolsky40018472011-02-26 01:02:56 +000011837static PyObject *
11838unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011840 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841 Py_ssize_t isize;
11842 Py_ssize_t osize, squote, dquote, i, o;
11843 Py_UCS4 max, quote;
11844 int ikind, okind;
11845 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011846
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011848 return NULL;
11849
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011850 isize = PyUnicode_GET_LENGTH(unicode);
11851 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011852
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011853 /* Compute length of output, quote characters, and
11854 maximum character */
11855 osize = 2; /* quotes */
11856 max = 127;
11857 squote = dquote = 0;
11858 ikind = PyUnicode_KIND(unicode);
11859 for (i = 0; i < isize; i++) {
11860 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11861 switch (ch) {
11862 case '\'': squote++; osize++; break;
11863 case '"': dquote++; osize++; break;
11864 case '\\': case '\t': case '\r': case '\n':
11865 osize += 2; break;
11866 default:
11867 /* Fast-path ASCII */
11868 if (ch < ' ' || ch == 0x7f)
11869 osize += 4; /* \xHH */
11870 else if (ch < 0x7f)
11871 osize++;
11872 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11873 osize++;
11874 max = ch > max ? ch : max;
11875 }
11876 else if (ch < 0x100)
11877 osize += 4; /* \xHH */
11878 else if (ch < 0x10000)
11879 osize += 6; /* \uHHHH */
11880 else
11881 osize += 10; /* \uHHHHHHHH */
11882 }
11883 }
11884
11885 quote = '\'';
11886 if (squote) {
11887 if (dquote)
11888 /* Both squote and dquote present. Use squote,
11889 and escape them */
11890 osize += squote;
11891 else
11892 quote = '"';
11893 }
11894
11895 repr = PyUnicode_New(osize, max);
11896 if (repr == NULL)
11897 return NULL;
11898 okind = PyUnicode_KIND(repr);
11899 odata = PyUnicode_DATA(repr);
11900
11901 PyUnicode_WRITE(okind, odata, 0, quote);
11902 PyUnicode_WRITE(okind, odata, osize-1, quote);
11903
11904 for (i = 0, o = 1; i < isize; i++) {
11905 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011906
11907 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 if ((ch == quote) || (ch == '\\')) {
11909 PyUnicode_WRITE(okind, odata, o++, '\\');
11910 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011911 continue;
11912 }
11913
Benjamin Peterson29060642009-01-31 22:14:21 +000011914 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011915 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 PyUnicode_WRITE(okind, odata, o++, '\\');
11917 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011918 }
11919 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011920 PyUnicode_WRITE(okind, odata, o++, '\\');
11921 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011922 }
11923 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011924 PyUnicode_WRITE(okind, odata, o++, '\\');
11925 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011926 }
11927
11928 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011929 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 PyUnicode_WRITE(okind, odata, o++, '\\');
11931 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011932 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11933 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011934 }
11935
Georg Brandl559e5d72008-06-11 18:37:52 +000011936 /* Copy ASCII characters as-is */
11937 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011939 }
11940
Benjamin Peterson29060642009-01-31 22:14:21 +000011941 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011942 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011943 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011944 (categories Z* and C* except ASCII space)
11945 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011947 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011948 if (ch <= 0xff) {
11949 PyUnicode_WRITE(okind, odata, o++, '\\');
11950 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011951 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11952 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011953 }
11954 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011955 else if (ch >= 0x10000) {
11956 PyUnicode_WRITE(okind, odata, o++, '\\');
11957 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011958 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11959 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11960 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11961 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
11962 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11963 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11964 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11965 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011966 }
11967 /* Map 16-bit characters to '\uxxxx' */
11968 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 PyUnicode_WRITE(okind, odata, o++, '\\');
11970 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011971 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11972 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11973 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11974 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011975 }
11976 }
11977 /* Copy characters as-is */
11978 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011980 }
11981 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011982 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011984 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011985 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011986}
11987
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011988PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990\n\
11991Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011992such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011993arguments start and end are interpreted as in slice notation.\n\
11994\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011995Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011996
11997static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012000 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012001 Py_ssize_t start;
12002 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012003 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004
Jesus Ceaac451502011-04-20 17:09:23 +020012005 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12006 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012007 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009 if (PyUnicode_READY(self) == -1)
12010 return NULL;
12011 if (PyUnicode_READY(substring) == -1)
12012 return NULL;
12013
Victor Stinner7931d9a2011-11-04 00:22:48 +010012014 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015
12016 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018 if (result == -2)
12019 return NULL;
12020
Christian Heimes217cfd12007-12-02 14:31:20 +000012021 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022}
12023
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012024PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012025 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012027Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028
12029static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012031{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012032 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012033 Py_ssize_t start;
12034 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012035 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036
Jesus Ceaac451502011-04-20 17:09:23 +020012037 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12038 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012039 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041 if (PyUnicode_READY(self) == -1)
12042 return NULL;
12043 if (PyUnicode_READY(substring) == -1)
12044 return NULL;
12045
Victor Stinner7931d9a2011-11-04 00:22:48 +010012046 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047
12048 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012050 if (result == -2)
12051 return NULL;
12052
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053 if (result < 0) {
12054 PyErr_SetString(PyExc_ValueError, "substring not found");
12055 return NULL;
12056 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012057
Christian Heimes217cfd12007-12-02 14:31:20 +000012058 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059}
12060
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012061PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012062 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012063\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012064Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012065done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066
12067static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012068unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012069{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012070 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 Py_UCS4 fillchar = ' ';
12072
Victor Stinnere9a29352011-10-01 02:14:59 +020012073 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012074 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012075
Victor Stinnerc4b49542011-12-11 22:44:26 +010012076 if (PyUnicode_READY(self) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077 return NULL;
12078
Victor Stinnerc4b49542011-12-11 22:44:26 +010012079 if (PyUnicode_GET_LENGTH(self) >= width)
12080 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081
Victor Stinnerc4b49542011-12-11 22:44:26 +010012082 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083}
12084
Alexander Belopolsky40018472011-02-26 01:02:56 +000012085PyObject *
12086PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087{
12088 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012089
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090 s = PyUnicode_FromObject(s);
12091 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012092 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012093 if (sep != NULL) {
12094 sep = PyUnicode_FromObject(sep);
12095 if (sep == NULL) {
12096 Py_DECREF(s);
12097 return NULL;
12098 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012099 }
12100
Victor Stinner9310abb2011-10-05 00:59:23 +020012101 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102
12103 Py_DECREF(s);
12104 Py_XDECREF(sep);
12105 return result;
12106}
12107
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012108PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012109 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110\n\
12111Return a list of the words in S, using sep as the\n\
12112delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012113splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012114whitespace string is a separator and empty strings are\n\
12115removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012116
12117static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012118unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012119{
12120 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012121 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012122
Martin v. Löwis18e16552006-02-15 17:27:45 +000012123 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124 return NULL;
12125
12126 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012127 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012128 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012129 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012130 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012131 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012132}
12133
Thomas Wouters477c8d52006-05-27 19:21:47 +000012134PyObject *
12135PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12136{
12137 PyObject* str_obj;
12138 PyObject* sep_obj;
12139 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012140 int kind1, kind2, kind;
12141 void *buf1 = NULL, *buf2 = NULL;
12142 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012143
12144 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012145 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012146 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012147 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012149 Py_DECREF(str_obj);
12150 return NULL;
12151 }
12152
Victor Stinner14f8f022011-10-05 20:58:25 +020012153 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012154 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012155 kind = Py_MAX(kind1, kind2);
12156 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012157 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012158 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159 if (!buf1)
12160 goto onError;
12161 buf2 = PyUnicode_DATA(sep_obj);
12162 if (kind2 != kind)
12163 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12164 if (!buf2)
12165 goto onError;
12166 len1 = PyUnicode_GET_LENGTH(str_obj);
12167 len2 = PyUnicode_GET_LENGTH(sep_obj);
12168
Victor Stinner14f8f022011-10-05 20:58:25 +020012169 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012170 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012171 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12172 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12173 else
12174 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012175 break;
12176 case PyUnicode_2BYTE_KIND:
12177 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12178 break;
12179 case PyUnicode_4BYTE_KIND:
12180 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12181 break;
12182 default:
12183 assert(0);
12184 out = 0;
12185 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012186
12187 Py_DECREF(sep_obj);
12188 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012189 if (kind1 != kind)
12190 PyMem_Free(buf1);
12191 if (kind2 != kind)
12192 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012193
12194 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012195 onError:
12196 Py_DECREF(sep_obj);
12197 Py_DECREF(str_obj);
12198 if (kind1 != kind && buf1)
12199 PyMem_Free(buf1);
12200 if (kind2 != kind && buf2)
12201 PyMem_Free(buf2);
12202 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012203}
12204
12205
12206PyObject *
12207PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12208{
12209 PyObject* str_obj;
12210 PyObject* sep_obj;
12211 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212 int kind1, kind2, kind;
12213 void *buf1 = NULL, *buf2 = NULL;
12214 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012215
12216 str_obj = PyUnicode_FromObject(str_in);
12217 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012218 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012219 sep_obj = PyUnicode_FromObject(sep_in);
12220 if (!sep_obj) {
12221 Py_DECREF(str_obj);
12222 return NULL;
12223 }
12224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012225 kind1 = PyUnicode_KIND(str_in);
12226 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012227 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012228 buf1 = PyUnicode_DATA(str_in);
12229 if (kind1 != kind)
12230 buf1 = _PyUnicode_AsKind(str_in, kind);
12231 if (!buf1)
12232 goto onError;
12233 buf2 = PyUnicode_DATA(sep_obj);
12234 if (kind2 != kind)
12235 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12236 if (!buf2)
12237 goto onError;
12238 len1 = PyUnicode_GET_LENGTH(str_obj);
12239 len2 = PyUnicode_GET_LENGTH(sep_obj);
12240
12241 switch(PyUnicode_KIND(str_in)) {
12242 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012243 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12244 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12245 else
12246 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247 break;
12248 case PyUnicode_2BYTE_KIND:
12249 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12250 break;
12251 case PyUnicode_4BYTE_KIND:
12252 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12253 break;
12254 default:
12255 assert(0);
12256 out = 0;
12257 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012258
12259 Py_DECREF(sep_obj);
12260 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012261 if (kind1 != kind)
12262 PyMem_Free(buf1);
12263 if (kind2 != kind)
12264 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012265
12266 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012267 onError:
12268 Py_DECREF(sep_obj);
12269 Py_DECREF(str_obj);
12270 if (kind1 != kind && buf1)
12271 PyMem_Free(buf1);
12272 if (kind2 != kind && buf2)
12273 PyMem_Free(buf2);
12274 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012275}
12276
PyDoc_STRVAR(partition__doc__,
             "S.partition(sep) -> (head, sep, tail)\n\
\n\
Search for the separator sep in S, and return the part before it,\n\
the separator itself, and the part after it. If the separator is not\n\
found, return S and two empty strings.");

/* Method wrapper for str.partition(sep): forwards self and the separator
   to PyUnicode_Partition() and returns its result unchanged. */
static PyObject*
unicode_partition(PyObject *self, PyObject *separator)
{
    return PyUnicode_Partition(self, separator);
}
12289
PyDoc_STRVAR(rpartition__doc__,
             "S.rpartition(sep) -> (head, sep, tail)\n\
\n\
Search for the separator sep in S, starting at the end of S, and return\n\
the part before it, the separator itself, and the part after it. If the\n\
separator is not found, return two empty strings and S.");

/* Method wrapper for str.rpartition(sep): forwards self and the separator
   to PyUnicode_RPartition() and returns its result unchanged. */
static PyObject*
unicode_rpartition(PyObject *self, PyObject *separator)
{
    return PyUnicode_RPartition(self, separator);
}
12302
Alexander Belopolsky40018472011-02-26 01:02:56 +000012303PyObject *
12304PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012305{
12306 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012307
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012308 s = PyUnicode_FromObject(s);
12309 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012310 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012311 if (sep != NULL) {
12312 sep = PyUnicode_FromObject(sep);
12313 if (sep == NULL) {
12314 Py_DECREF(s);
12315 return NULL;
12316 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012317 }
12318
Victor Stinner9310abb2011-10-05 00:59:23 +020012319 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012320
12321 Py_DECREF(s);
12322 Py_XDECREF(sep);
12323 return result;
12324}
12325
12326PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012327 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012328\n\
12329Return a list of the words in S, using sep as the\n\
12330delimiter string, starting at the end of the string and\n\
12331working to the front. If maxsplit is given, at most maxsplit\n\
12332splits are done. If sep is not specified, any whitespace string\n\
12333is a separator.");
12334
12335static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012336unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012337{
12338 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012339 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012340
Martin v. Löwis18e16552006-02-15 17:27:45 +000012341 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012342 return NULL;
12343
12344 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012345 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012346 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012347 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012348 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012349 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012350}
12351
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012352PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012353 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012354\n\
12355Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012356Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012357is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358
12359static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012360unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012361{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012362 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012363 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012365 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12366 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012367 return NULL;
12368
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012369 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012370}
12371
/* str() of a unicode object: delegates to unicode_result_unchanged(self)
   and returns its result. */
static
PyObject *unicode_str(PyObject *self)
{
    return unicode_result_unchanged(self);
}
12377
PyDoc_STRVAR(swapcase__doc__,
             "S.swapcase() -> str\n\
\n\
Return a copy of S with uppercase characters converted to lowercase\n\
and vice versa.");

/* Implementation of str.swapcase(): runs the generic fixup() driver with
   the fixswapcase callback and returns its result. */
static PyObject*
unicode_swapcase(PyObject *self)
{
    return fixup(self, fixswapcase);
}
12389
Georg Brandlceee0772007-11-27 23:48:05 +000012390PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012391 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012392\n\
12393Return a translation table usable for str.translate().\n\
12394If there is only one argument, it must be a dictionary mapping Unicode\n\
12395ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012396Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012397If there are two arguments, they must be strings of equal length, and\n\
12398in the resulting dictionary, each character in x will be mapped to the\n\
12399character at the same position in y. If there is a third argument, it\n\
12400must be a string, whose characters will be mapped to None in the result.");
12401
12402static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012403unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012404{
12405 PyObject *x, *y = NULL, *z = NULL;
12406 PyObject *new = NULL, *key, *value;
12407 Py_ssize_t i = 0;
12408 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012409
Georg Brandlceee0772007-11-27 23:48:05 +000012410 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12411 return NULL;
12412 new = PyDict_New();
12413 if (!new)
12414 return NULL;
12415 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012416 int x_kind, y_kind, z_kind;
12417 void *x_data, *y_data, *z_data;
12418
Georg Brandlceee0772007-11-27 23:48:05 +000012419 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012420 if (!PyUnicode_Check(x)) {
12421 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12422 "be a string if there is a second argument");
12423 goto err;
12424 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012425 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012426 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12427 "arguments must have equal length");
12428 goto err;
12429 }
12430 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012431 x_kind = PyUnicode_KIND(x);
12432 y_kind = PyUnicode_KIND(y);
12433 x_data = PyUnicode_DATA(x);
12434 y_data = PyUnicode_DATA(y);
12435 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12436 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12437 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012438 if (!key || !value)
12439 goto err;
12440 res = PyDict_SetItem(new, key, value);
12441 Py_DECREF(key);
12442 Py_DECREF(value);
12443 if (res < 0)
12444 goto err;
12445 }
12446 /* create entries for deleting chars in z */
12447 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012448 z_kind = PyUnicode_KIND(z);
12449 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012450 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012451 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012452 if (!key)
12453 goto err;
12454 res = PyDict_SetItem(new, key, Py_None);
12455 Py_DECREF(key);
12456 if (res < 0)
12457 goto err;
12458 }
12459 }
12460 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012461 int kind;
12462 void *data;
12463
Georg Brandlceee0772007-11-27 23:48:05 +000012464 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012465 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012466 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12467 "to maketrans it must be a dict");
12468 goto err;
12469 }
12470 /* copy entries into the new dict, converting string keys to int keys */
12471 while (PyDict_Next(x, &i, &key, &value)) {
12472 if (PyUnicode_Check(key)) {
12473 /* convert string keys to integer keys */
12474 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012475 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012476 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12477 "table must be of length 1");
12478 goto err;
12479 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480 kind = PyUnicode_KIND(key);
12481 data = PyUnicode_DATA(key);
12482 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012483 if (!newkey)
12484 goto err;
12485 res = PyDict_SetItem(new, newkey, value);
12486 Py_DECREF(newkey);
12487 if (res < 0)
12488 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012489 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012490 /* just keep integer keys */
12491 if (PyDict_SetItem(new, key, value) < 0)
12492 goto err;
12493 } else {
12494 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12495 "be strings or integers");
12496 goto err;
12497 }
12498 }
12499 }
12500 return new;
12501 err:
12502 Py_DECREF(new);
12503 return NULL;
12504}
12505
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012506PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012507 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012508\n\
12509Return a copy of the string S, where all characters have been mapped\n\
12510through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012511Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012512Unmapped characters are left untouched. Characters mapped to None\n\
12513are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012514
12515static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012516unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519}
12520
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012521PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012522 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012523\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012524Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525
12526static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012527unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529 return fixup(self, fixupper);
12530}
12531
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012532PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012533 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012534\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012535Pad a numeric string S with zeros on the left, to fill a field\n\
12536of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537
12538static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012539unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012541 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012542 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012543 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 int kind;
12545 void *data;
12546 Py_UCS4 chr;
12547
Martin v. Löwis18e16552006-02-15 17:27:45 +000012548 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549 return NULL;
12550
Victor Stinnerc4b49542011-12-11 22:44:26 +010012551 if (PyUnicode_READY(self) < 0)
12552 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553
Victor Stinnerc4b49542011-12-11 22:44:26 +010012554 if (PyUnicode_GET_LENGTH(self) >= width)
12555 return unicode_result_unchanged(self);
12556
12557 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012558
12559 u = pad(self, fill, 0, '0');
12560
Walter Dörwald068325e2002-04-15 13:36:47 +000012561 if (u == NULL)
12562 return NULL;
12563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012564 kind = PyUnicode_KIND(u);
12565 data = PyUnicode_DATA(u);
12566 chr = PyUnicode_READ(kind, data, fill);
12567
12568 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012569 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012570 PyUnicode_WRITE(kind, data, 0, chr);
12571 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572 }
12573
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012574 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012575 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012576}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577
#if 0
/* Compiled out.  Thin wrapper returning the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12585
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012586PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012587 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012588\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012589Return True if S starts with the specified prefix, False otherwise.\n\
12590With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012591With optional end, stop comparing S at that position.\n\
12592prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593
12594static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012595unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012596 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012597{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012598 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012599 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012600 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012601 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012602 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012603
Jesus Ceaac451502011-04-20 17:09:23 +020012604 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012605 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012606 if (PyTuple_Check(subobj)) {
12607 Py_ssize_t i;
12608 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012609 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012610 if (substring == NULL)
12611 return NULL;
12612 result = tailmatch(self, substring, start, end, -1);
12613 Py_DECREF(substring);
12614 if (result) {
12615 Py_RETURN_TRUE;
12616 }
12617 }
12618 /* nothing matched */
12619 Py_RETURN_FALSE;
12620 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012621 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012622 if (substring == NULL) {
12623 if (PyErr_ExceptionMatches(PyExc_TypeError))
12624 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12625 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012626 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012627 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012628 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012629 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012630 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631}
12632
12633
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012634PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012635 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012637Return True if S ends with the specified suffix, False otherwise.\n\
12638With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012639With optional end, stop comparing S at that position.\n\
12640suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641
12642static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012643unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012644 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012645{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012646 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012647 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012648 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012649 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012650 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012651
Jesus Ceaac451502011-04-20 17:09:23 +020012652 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012653 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012654 if (PyTuple_Check(subobj)) {
12655 Py_ssize_t i;
12656 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012657 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012658 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012659 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012660 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012661 result = tailmatch(self, substring, start, end, +1);
12662 Py_DECREF(substring);
12663 if (result) {
12664 Py_RETURN_TRUE;
12665 }
12666 }
12667 Py_RETURN_FALSE;
12668 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012669 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012670 if (substring == NULL) {
12671 if (PyErr_ExceptionMatches(PyExc_TypeError))
12672 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12673 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012674 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012675 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012676 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012677 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012678 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679}
12680
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012682
12683PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012684 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012685\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012686Return a formatted version of S, using substitutions from args and kwargs.\n\
12687The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012688
Eric Smith27bbca62010-11-04 17:06:58 +000012689PyDoc_STRVAR(format_map__doc__,
12690 "S.format_map(mapping) -> str\n\
12691\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012692Return a formatted version of S, using substitutions from mapping.\n\
12693The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012694
Eric Smith4a7d76d2008-05-30 18:10:19 +000012695static PyObject *
12696unicode__format__(PyObject* self, PyObject* args)
12697{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012698 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012699
12700 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12701 return NULL;
12702
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012703 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012705 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012706}
12707
Eric Smith8c663262007-08-25 02:26:07 +000012708PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012709 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012710\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012711Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012712
12713static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012714unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012715{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012716 Py_ssize_t size;
12717
12718 /* If it's a compact object, account for base structure +
12719 character data. */
12720 if (PyUnicode_IS_COMPACT_ASCII(v))
12721 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12722 else if (PyUnicode_IS_COMPACT(v))
12723 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012724 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 else {
12726 /* If it is a two-block object, account for base object, and
12727 for character block if present. */
12728 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012729 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012730 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012731 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732 }
12733 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012734 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012735 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012737 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012738 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739
12740 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012741}
12742
12743PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012744 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012745
12746static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012747unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012748{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012749 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012750 if (!copy)
12751 return NULL;
12752 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012753}
12754
Guido van Rossumd57fd912000-03-10 22:53:23 +000012755static PyMethodDef unicode_methods[] = {
12756
12757 /* Order is according to common usage: often used methods should
12758 appear first, since lookup is done sequentially. */
12759
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012760 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012761 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12762 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012763 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012764 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12765 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12766 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12767 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12768 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12769 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12770 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012771 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012772 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12773 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12774 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012775 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012776 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12777 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12778 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012779 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012780 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012781 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012782 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012783 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12784 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12785 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12786 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12787 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12788 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12789 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12790 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12791 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12792 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12793 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12794 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12795 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12796 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012797 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012798 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012799 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012800 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012801 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012802 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012803 {"maketrans", (PyCFunction) unicode_maketrans,
12804 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012805 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012806#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012807 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012808#endif
12809
12810#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012811 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012812 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012813#endif
12814
Benjamin Peterson14339b62009-01-31 16:36:08 +000012815 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816 {NULL, NULL}
12817};
12818
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012819static PyObject *
12820unicode_mod(PyObject *v, PyObject *w)
12821{
Brian Curtindfc80e32011-08-10 20:28:54 -050012822 if (!PyUnicode_Check(v))
12823 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012824 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012825}
12826
12827static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012828 0, /*nb_add*/
12829 0, /*nb_subtract*/
12830 0, /*nb_multiply*/
12831 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012832};
12833
Guido van Rossumd57fd912000-03-10 22:53:23 +000012834static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012835 (lenfunc) unicode_length, /* sq_length */
12836 PyUnicode_Concat, /* sq_concat */
12837 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12838 (ssizeargfunc) unicode_getitem, /* sq_item */
12839 0, /* sq_slice */
12840 0, /* sq_ass_item */
12841 0, /* sq_ass_slice */
12842 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012843};
12844
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012845static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012846unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012847{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012848 if (PyUnicode_READY(self) == -1)
12849 return NULL;
12850
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012851 if (PyIndex_Check(item)) {
12852 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012853 if (i == -1 && PyErr_Occurred())
12854 return NULL;
12855 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012856 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012857 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012858 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012859 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012860 PyObject *result;
12861 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012862 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012863 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012865 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012866 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012867 return NULL;
12868 }
12869
12870 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010012871 Py_INCREF(unicode_empty);
12872 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012873 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010012874 slicelength == PyUnicode_GET_LENGTH(self)) {
12875 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000012876 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012877 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020012878 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012879 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012880 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012881 src_kind = PyUnicode_KIND(self);
12882 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020012883 if (!PyUnicode_IS_ASCII(self)) {
12884 kind_limit = kind_maxchar_limit(src_kind);
12885 max_char = 0;
12886 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12887 ch = PyUnicode_READ(src_kind, src_data, cur);
12888 if (ch > max_char) {
12889 max_char = ch;
12890 if (max_char >= kind_limit)
12891 break;
12892 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012893 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012894 }
Victor Stinner55c99112011-10-13 01:17:06 +020012895 else
12896 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012897 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012898 if (result == NULL)
12899 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012900 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012901 dest_data = PyUnicode_DATA(result);
12902
12903 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012904 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12905 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012906 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012907 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012908 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012909 } else {
12910 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12911 return NULL;
12912 }
12913}
12914
12915static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012916 (lenfunc)unicode_length, /* mp_length */
12917 (binaryfunc)unicode_subscript, /* mp_subscript */
12918 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012919};
12920
Guido van Rossumd57fd912000-03-10 22:53:23 +000012921
Guido van Rossumd57fd912000-03-10 22:53:23 +000012922/* Helpers for PyUnicode_Format() */
12923
12924static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012925getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012926{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012927 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012928 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012929 (*p_argidx)++;
12930 if (arglen < 0)
12931 return args;
12932 else
12933 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012934 }
12935 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012936 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012937 return NULL;
12938}
12939
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012940/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012941
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012942static PyObject *
12943formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012944{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012945 char *p;
12946 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012947 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012948
Guido van Rossumd57fd912000-03-10 22:53:23 +000012949 x = PyFloat_AsDouble(v);
12950 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012951 return NULL;
12952
Guido van Rossumd57fd912000-03-10 22:53:23 +000012953 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012954 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012955
Eric Smith0923d1d2009-04-16 20:16:10 +000012956 p = PyOS_double_to_string(x, type, prec,
12957 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012958 if (p == NULL)
12959 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012960 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012961 PyMem_Free(p);
12962 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012963}
12964
Tim Peters38fd5b62000-09-21 05:43:11 +000012965static PyObject*
12966formatlong(PyObject *val, int flags, int prec, int type)
12967{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012968 char *buf;
12969 int len;
12970 PyObject *str; /* temporary string object. */
12971 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012972
Benjamin Peterson14339b62009-01-31 16:36:08 +000012973 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12974 if (!str)
12975 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012976 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012977 Py_DECREF(str);
12978 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012979}
12980
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012981static Py_UCS4
12982formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012983{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012984 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012985 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012986 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012987 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012988 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012989 goto onError;
12990 }
12991 else {
12992 /* Integer input truncated to a character */
12993 long x;
12994 x = PyLong_AsLong(v);
12995 if (x == -1 && PyErr_Occurred())
12996 goto onError;
12997
Victor Stinner8faf8212011-12-08 22:14:11 +010012998 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012999 PyErr_SetString(PyExc_OverflowError,
13000 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013001 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013002 }
13003
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013004 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013005 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013006
Benjamin Peterson29060642009-01-31 22:14:21 +000013007 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013008 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013009 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013010 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013011}
13012
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013013static int
13014repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13015{
13016 int r;
13017 assert(count > 0);
13018 assert(PyUnicode_Check(obj));
13019 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013020 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013021 if (repeated == NULL)
13022 return -1;
13023 r = _PyAccu_Accumulate(acc, repeated);
13024 Py_DECREF(repeated);
13025 return r;
13026 }
13027 else {
13028 do {
13029 if (_PyAccu_Accumulate(acc, obj))
13030 return -1;
13031 } while (--count);
13032 return 0;
13033 }
13034}
13035
Alexander Belopolsky40018472011-02-26 01:02:56 +000013036PyObject *
13037PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013038{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013039 void *fmt;
13040 int fmtkind;
13041 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013042 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013043 int r;
13044 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013045 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013046 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013047 PyObject *temp = NULL;
13048 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013049 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013050 _PyAccu acc;
13051 static PyObject *plus, *minus, *blank, *zero, *percent;
13052
13053 if (!plus && !(plus = get_latin1_char('+')))
13054 return NULL;
13055 if (!minus && !(minus = get_latin1_char('-')))
13056 return NULL;
13057 if (!blank && !(blank = get_latin1_char(' ')))
13058 return NULL;
13059 if (!zero && !(zero = get_latin1_char('0')))
13060 return NULL;
13061 if (!percent && !(percent = get_latin1_char('%')))
13062 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013063
Guido van Rossumd57fd912000-03-10 22:53:23 +000013064 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013065 PyErr_BadInternalCall();
13066 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013067 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013068 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013069 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013070 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013071 if (_PyAccu_Init(&acc))
13072 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013073 fmt = PyUnicode_DATA(uformat);
13074 fmtkind = PyUnicode_KIND(uformat);
13075 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13076 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013077
Guido van Rossumd57fd912000-03-10 22:53:23 +000013078 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013079 arglen = PyTuple_Size(args);
13080 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013081 }
13082 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013083 arglen = -1;
13084 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013085 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013086 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013087 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013088 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013089
13090 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013091 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013092 PyObject *nonfmt;
13093 Py_ssize_t nonfmtpos;
13094 nonfmtpos = fmtpos++;
13095 while (fmtcnt >= 0 &&
13096 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13097 fmtpos++;
13098 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013099 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013100 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013101 if (nonfmt == NULL)
13102 goto onError;
13103 r = _PyAccu_Accumulate(&acc, nonfmt);
13104 Py_DECREF(nonfmt);
13105 if (r)
13106 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013107 }
13108 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013109 /* Got a format specifier */
13110 int flags = 0;
13111 Py_ssize_t width = -1;
13112 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013113 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013114 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013115 int isnumok;
13116 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013117 void *pbuf = NULL;
13118 Py_ssize_t pindex, len;
13119 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013121 fmtpos++;
13122 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13123 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013124 Py_ssize_t keylen;
13125 PyObject *key;
13126 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013127
Benjamin Peterson29060642009-01-31 22:14:21 +000013128 if (dict == NULL) {
13129 PyErr_SetString(PyExc_TypeError,
13130 "format requires a mapping");
13131 goto onError;
13132 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013133 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013134 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013135 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013136 /* Skip over balanced parentheses */
13137 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013138 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013139 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013140 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013141 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013142 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013143 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013144 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013145 if (fmtcnt < 0 || pcount > 0) {
13146 PyErr_SetString(PyExc_ValueError,
13147 "incomplete format key");
13148 goto onError;
13149 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013150 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013151 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013152 if (key == NULL)
13153 goto onError;
13154 if (args_owned) {
13155 Py_DECREF(args);
13156 args_owned = 0;
13157 }
13158 args = PyObject_GetItem(dict, key);
13159 Py_DECREF(key);
13160 if (args == NULL) {
13161 goto onError;
13162 }
13163 args_owned = 1;
13164 arglen = -1;
13165 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013166 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013167 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013168 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013169 case '-': flags |= F_LJUST; continue;
13170 case '+': flags |= F_SIGN; continue;
13171 case ' ': flags |= F_BLANK; continue;
13172 case '#': flags |= F_ALT; continue;
13173 case '0': flags |= F_ZERO; continue;
13174 }
13175 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013176 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013177 if (c == '*') {
13178 v = getnextarg(args, arglen, &argidx);
13179 if (v == NULL)
13180 goto onError;
13181 if (!PyLong_Check(v)) {
13182 PyErr_SetString(PyExc_TypeError,
13183 "* wants int");
13184 goto onError;
13185 }
13186 width = PyLong_AsLong(v);
13187 if (width == -1 && PyErr_Occurred())
13188 goto onError;
13189 if (width < 0) {
13190 flags |= F_LJUST;
13191 width = -width;
13192 }
13193 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013194 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013195 }
13196 else if (c >= '0' && c <= '9') {
13197 width = c - '0';
13198 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013199 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013200 if (c < '0' || c > '9')
13201 break;
13202 if ((width*10) / 10 != width) {
13203 PyErr_SetString(PyExc_ValueError,
13204 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013205 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013206 }
13207 width = width*10 + (c - '0');
13208 }
13209 }
13210 if (c == '.') {
13211 prec = 0;
13212 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013213 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013214 if (c == '*') {
13215 v = getnextarg(args, arglen, &argidx);
13216 if (v == NULL)
13217 goto onError;
13218 if (!PyLong_Check(v)) {
13219 PyErr_SetString(PyExc_TypeError,
13220 "* wants int");
13221 goto onError;
13222 }
13223 prec = PyLong_AsLong(v);
13224 if (prec == -1 && PyErr_Occurred())
13225 goto onError;
13226 if (prec < 0)
13227 prec = 0;
13228 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013229 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013230 }
13231 else if (c >= '0' && c <= '9') {
13232 prec = c - '0';
13233 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013234 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013235 if (c < '0' || c > '9')
13236 break;
13237 if ((prec*10) / 10 != prec) {
13238 PyErr_SetString(PyExc_ValueError,
13239 "prec too big");
13240 goto onError;
13241 }
13242 prec = prec*10 + (c - '0');
13243 }
13244 }
13245 } /* prec */
13246 if (fmtcnt >= 0) {
13247 if (c == 'h' || c == 'l' || c == 'L') {
13248 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013249 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013250 }
13251 }
13252 if (fmtcnt < 0) {
13253 PyErr_SetString(PyExc_ValueError,
13254 "incomplete format");
13255 goto onError;
13256 }
13257 if (c != '%') {
13258 v = getnextarg(args, arglen, &argidx);
13259 if (v == NULL)
13260 goto onError;
13261 }
13262 sign = 0;
13263 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013264 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013265 switch (c) {
13266
13267 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013268 _PyAccu_Accumulate(&acc, percent);
13269 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013270
13271 case 's':
13272 case 'r':
13273 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013274 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013275 temp = v;
13276 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013277 }
13278 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013279 if (c == 's')
13280 temp = PyObject_Str(v);
13281 else if (c == 'r')
13282 temp = PyObject_Repr(v);
13283 else
13284 temp = PyObject_ASCII(v);
13285 if (temp == NULL)
13286 goto onError;
13287 if (PyUnicode_Check(temp))
13288 /* nothing to do */;
13289 else {
13290 Py_DECREF(temp);
13291 PyErr_SetString(PyExc_TypeError,
13292 "%s argument has non-string str()");
13293 goto onError;
13294 }
13295 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013296 if (PyUnicode_READY(temp) == -1) {
13297 Py_CLEAR(temp);
13298 goto onError;
13299 }
13300 pbuf = PyUnicode_DATA(temp);
13301 kind = PyUnicode_KIND(temp);
13302 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013303 if (prec >= 0 && len > prec)
13304 len = prec;
13305 break;
13306
13307 case 'i':
13308 case 'd':
13309 case 'u':
13310 case 'o':
13311 case 'x':
13312 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013313 isnumok = 0;
13314 if (PyNumber_Check(v)) {
13315 PyObject *iobj=NULL;
13316
13317 if (PyLong_Check(v)) {
13318 iobj = v;
13319 Py_INCREF(iobj);
13320 }
13321 else {
13322 iobj = PyNumber_Long(v);
13323 }
13324 if (iobj!=NULL) {
13325 if (PyLong_Check(iobj)) {
13326 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013327 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013328 Py_DECREF(iobj);
13329 if (!temp)
13330 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013331 if (PyUnicode_READY(temp) == -1) {
13332 Py_CLEAR(temp);
13333 goto onError;
13334 }
13335 pbuf = PyUnicode_DATA(temp);
13336 kind = PyUnicode_KIND(temp);
13337 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013338 sign = 1;
13339 }
13340 else {
13341 Py_DECREF(iobj);
13342 }
13343 }
13344 }
13345 if (!isnumok) {
13346 PyErr_Format(PyExc_TypeError,
13347 "%%%c format: a number is required, "
13348 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13349 goto onError;
13350 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013351 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013352 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013353 fillobj = zero;
13354 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013355 break;
13356
13357 case 'e':
13358 case 'E':
13359 case 'f':
13360 case 'F':
13361 case 'g':
13362 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013363 temp = formatfloat(v, flags, prec, c);
13364 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013365 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013366 if (PyUnicode_READY(temp) == -1) {
13367 Py_CLEAR(temp);
13368 goto onError;
13369 }
13370 pbuf = PyUnicode_DATA(temp);
13371 kind = PyUnicode_KIND(temp);
13372 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013373 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013374 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013375 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013376 fillobj = zero;
13377 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013378 break;
13379
13380 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013381 {
13382 Py_UCS4 ch = formatchar(v);
13383 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013384 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013385 temp = _PyUnicode_FromUCS4(&ch, 1);
13386 if (temp == NULL)
13387 goto onError;
13388 pbuf = PyUnicode_DATA(temp);
13389 kind = PyUnicode_KIND(temp);
13390 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013391 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013392 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013393
13394 default:
13395 PyErr_Format(PyExc_ValueError,
13396 "unsupported format character '%c' (0x%x) "
13397 "at index %zd",
13398 (31<=c && c<=126) ? (char)c : '?',
13399 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013400 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013401 goto onError;
13402 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013403 /* pbuf is initialized here. */
13404 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013405 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013406 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13407 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013408 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013409 pindex++;
13410 }
13411 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13412 signobj = plus;
13413 len--;
13414 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013415 }
13416 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013417 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013418 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013419 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013420 else
13421 sign = 0;
13422 }
13423 if (width < len)
13424 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013425 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013426 if (fill != ' ') {
13427 assert(signobj != NULL);
13428 if (_PyAccu_Accumulate(&acc, signobj))
13429 goto onError;
13430 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013431 if (width > len)
13432 width--;
13433 }
13434 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013435 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013436 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013437 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013438 second = get_latin1_char(
13439 PyUnicode_READ(kind, pbuf, pindex + 1));
13440 pindex += 2;
13441 if (second == NULL ||
13442 _PyAccu_Accumulate(&acc, zero) ||
13443 _PyAccu_Accumulate(&acc, second))
13444 goto onError;
13445 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013446 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013447 width -= 2;
13448 if (width < 0)
13449 width = 0;
13450 len -= 2;
13451 }
13452 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013453 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013454 if (repeat_accumulate(&acc, fillobj, width - len))
13455 goto onError;
13456 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013457 }
13458 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013459 if (sign) {
13460 assert(signobj != NULL);
13461 if (_PyAccu_Accumulate(&acc, signobj))
13462 goto onError;
13463 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013464 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013465 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13466 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013467 second = get_latin1_char(
13468 PyUnicode_READ(kind, pbuf, pindex + 1));
13469 pindex += 2;
13470 if (second == NULL ||
13471 _PyAccu_Accumulate(&acc, zero) ||
13472 _PyAccu_Accumulate(&acc, second))
13473 goto onError;
13474 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013475 }
13476 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013477 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013478 if (temp != NULL) {
13479 assert(pbuf == PyUnicode_DATA(temp));
13480 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013481 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013482 else {
13483 const char *p = (const char *) pbuf;
13484 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013485 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013486 v = PyUnicode_FromKindAndData(kind, p, len);
13487 }
13488 if (v == NULL)
13489 goto onError;
13490 r = _PyAccu_Accumulate(&acc, v);
13491 Py_DECREF(v);
13492 if (r)
13493 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013494 if (width > len && repeat_accumulate(&acc, blank, width - len))
13495 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013496 if (dict && (argidx < arglen) && c != '%') {
13497 PyErr_SetString(PyExc_TypeError,
13498 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013499 goto onError;
13500 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013501 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013502 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013503 } /* until end */
13504 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013505 PyErr_SetString(PyExc_TypeError,
13506 "not all arguments converted during string formatting");
13507 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013508 }
13509
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013510 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013511 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013512 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013513 }
13514 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013515 Py_XDECREF(temp);
13516 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013517 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013518
Benjamin Peterson29060642009-01-31 22:14:21 +000013519 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013520 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013521 Py_XDECREF(temp);
13522 Py_XDECREF(second);
13523 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013524 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013525 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013526 }
13527 return NULL;
13528}
13529
Jeremy Hylton938ace62002-07-17 16:30:39 +000013530static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013531unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13532
Tim Peters6d6c1a32001-08-02 04:15:00 +000013533static PyObject *
13534unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13535{
Benjamin Peterson29060642009-01-31 22:14:21 +000013536 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013537 static char *kwlist[] = {"object", "encoding", "errors", 0};
13538 char *encoding = NULL;
13539 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013540
Benjamin Peterson14339b62009-01-31 16:36:08 +000013541 if (type != &PyUnicode_Type)
13542 return unicode_subtype_new(type, args, kwds);
13543 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013544 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013545 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013546 if (x == NULL) {
13547 Py_INCREF(unicode_empty);
13548 return unicode_empty;
13549 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013550 if (encoding == NULL && errors == NULL)
13551 return PyObject_Str(x);
13552 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013553 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013554}
13555
Guido van Rossume023fe02001-08-30 03:12:59 +000013556static PyObject *
13557unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13558{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013559 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013560 Py_ssize_t length, char_size;
13561 int share_wstr, share_utf8;
13562 unsigned int kind;
13563 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013564
Benjamin Peterson14339b62009-01-31 16:36:08 +000013565 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013566
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013567 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013568 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013569 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013570 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013571 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013572 return NULL;
13573
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013574 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013575 if (self == NULL) {
13576 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013577 return NULL;
13578 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013579 kind = PyUnicode_KIND(unicode);
13580 length = PyUnicode_GET_LENGTH(unicode);
13581
13582 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013583#ifdef Py_DEBUG
13584 _PyUnicode_HASH(self) = -1;
13585#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013586 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013587#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013588 _PyUnicode_STATE(self).interned = 0;
13589 _PyUnicode_STATE(self).kind = kind;
13590 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013591 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013592 _PyUnicode_STATE(self).ready = 1;
13593 _PyUnicode_WSTR(self) = NULL;
13594 _PyUnicode_UTF8_LENGTH(self) = 0;
13595 _PyUnicode_UTF8(self) = NULL;
13596 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013597 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013598
13599 share_utf8 = 0;
13600 share_wstr = 0;
13601 if (kind == PyUnicode_1BYTE_KIND) {
13602 char_size = 1;
13603 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13604 share_utf8 = 1;
13605 }
13606 else if (kind == PyUnicode_2BYTE_KIND) {
13607 char_size = 2;
13608 if (sizeof(wchar_t) == 2)
13609 share_wstr = 1;
13610 }
13611 else {
13612 assert(kind == PyUnicode_4BYTE_KIND);
13613 char_size = 4;
13614 if (sizeof(wchar_t) == 4)
13615 share_wstr = 1;
13616 }
13617
13618 /* Ensure we won't overflow the length. */
13619 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13620 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013621 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013622 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013623 data = PyObject_MALLOC((length + 1) * char_size);
13624 if (data == NULL) {
13625 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013626 goto onError;
13627 }
13628
Victor Stinnerc3c74152011-10-02 20:39:55 +020013629 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013630 if (share_utf8) {
13631 _PyUnicode_UTF8_LENGTH(self) = length;
13632 _PyUnicode_UTF8(self) = data;
13633 }
13634 if (share_wstr) {
13635 _PyUnicode_WSTR_LENGTH(self) = length;
13636 _PyUnicode_WSTR(self) = (wchar_t *)data;
13637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013638
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013639 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013640 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013641 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013642#ifdef Py_DEBUG
13643 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13644#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013645 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013646 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013647
13648onError:
13649 Py_DECREF(unicode);
13650 Py_DECREF(self);
13651 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013652}
13653
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013654PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013655 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013656\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013657Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013658encoding defaults to the current default string encoding.\n\
13659errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013660
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013661static PyObject *unicode_iter(PyObject *seq);
13662
Guido van Rossumd57fd912000-03-10 22:53:23 +000013663PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013664 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013665 "str", /* tp_name */
13666 sizeof(PyUnicodeObject), /* tp_size */
13667 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013668 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013669 (destructor)unicode_dealloc, /* tp_dealloc */
13670 0, /* tp_print */
13671 0, /* tp_getattr */
13672 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013673 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013674 unicode_repr, /* tp_repr */
13675 &unicode_as_number, /* tp_as_number */
13676 &unicode_as_sequence, /* tp_as_sequence */
13677 &unicode_as_mapping, /* tp_as_mapping */
13678 (hashfunc) unicode_hash, /* tp_hash*/
13679 0, /* tp_call*/
13680 (reprfunc) unicode_str, /* tp_str */
13681 PyObject_GenericGetAttr, /* tp_getattro */
13682 0, /* tp_setattro */
13683 0, /* tp_as_buffer */
13684 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013685 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013686 unicode_doc, /* tp_doc */
13687 0, /* tp_traverse */
13688 0, /* tp_clear */
13689 PyUnicode_RichCompare, /* tp_richcompare */
13690 0, /* tp_weaklistoffset */
13691 unicode_iter, /* tp_iter */
13692 0, /* tp_iternext */
13693 unicode_methods, /* tp_methods */
13694 0, /* tp_members */
13695 0, /* tp_getset */
13696 &PyBaseObject_Type, /* tp_base */
13697 0, /* tp_dict */
13698 0, /* tp_descr_get */
13699 0, /* tp_descr_set */
13700 0, /* tp_dictoffset */
13701 0, /* tp_init */
13702 0, /* tp_alloc */
13703 unicode_new, /* tp_new */
13704 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013705};
13706
13707/* Initialize the Unicode implementation */
13708
Victor Stinner3a50e702011-10-18 21:21:00 +020013709int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013710{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013711 int i;
13712
Thomas Wouters477c8d52006-05-27 19:21:47 +000013713 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013714 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013715 0x000A, /* LINE FEED */
13716 0x000D, /* CARRIAGE RETURN */
13717 0x001C, /* FILE SEPARATOR */
13718 0x001D, /* GROUP SEPARATOR */
13719 0x001E, /* RECORD SEPARATOR */
13720 0x0085, /* NEXT LINE */
13721 0x2028, /* LINE SEPARATOR */
13722 0x2029, /* PARAGRAPH SEPARATOR */
13723 };
13724
Fred Drakee4315f52000-05-09 19:53:39 +000013725 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013726 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013727 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013728 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010013729 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013730
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013731 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013732 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013733 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013734 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013735
13736 /* initialize the linebreak bloom filter */
13737 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013738 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013739 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013740
13741 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013742
13743#ifdef HAVE_MBCS
13744 winver.dwOSVersionInfoSize = sizeof(winver);
13745 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13746 PyErr_SetFromWindowsErr(0);
13747 return -1;
13748 }
13749#endif
13750 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013751}
13752
13753/* Finalize the Unicode implementation */
13754
/* Clear the Unicode free list.

   The body performs no work; the function always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13760
Guido van Rossumd57fd912000-03-10 22:53:23 +000013761void
Thomas Wouters78890102000-07-22 19:25:51 +000013762_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013763{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013764 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013765
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013766 Py_XDECREF(unicode_empty);
13767 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013768
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013769 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013770 if (unicode_latin1[i]) {
13771 Py_DECREF(unicode_latin1[i]);
13772 unicode_latin1[i] = NULL;
13773 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013774 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013775 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013776 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013777}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013778
Walter Dörwald16807132007-05-25 13:52:07 +000013779void
13780PyUnicode_InternInPlace(PyObject **p)
13781{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013782 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013783 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013784#ifdef Py_DEBUG
13785 assert(s != NULL);
13786 assert(_PyUnicode_CHECK(s));
13787#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013788 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013789 return;
13790#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013791 /* If it's a subclass, we don't really know what putting
13792 it in the interned dict might do. */
13793 if (!PyUnicode_CheckExact(s))
13794 return;
13795 if (PyUnicode_CHECK_INTERNED(s))
13796 return;
13797 if (interned == NULL) {
13798 interned = PyDict_New();
13799 if (interned == NULL) {
13800 PyErr_Clear(); /* Don't leave an exception */
13801 return;
13802 }
13803 }
13804 /* It might be that the GetItem call fails even
13805 though the key is present in the dictionary,
13806 namely when this happens during a stack overflow. */
13807 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013808 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013809 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013810
Benjamin Peterson29060642009-01-31 22:14:21 +000013811 if (t) {
13812 Py_INCREF(t);
13813 Py_DECREF(*p);
13814 *p = t;
13815 return;
13816 }
Walter Dörwald16807132007-05-25 13:52:07 +000013817
Benjamin Peterson14339b62009-01-31 16:36:08 +000013818 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013819 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013820 PyErr_Clear();
13821 PyThreadState_GET()->recursion_critical = 0;
13822 return;
13823 }
13824 PyThreadState_GET()->recursion_critical = 0;
13825 /* The two references in interned are not counted by refcnt.
13826 The deallocator will take care of this */
13827 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013828 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013829}
13830
13831void
13832PyUnicode_InternImmortal(PyObject **p)
13833{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013834 PyUnicode_InternInPlace(p);
13835 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020013836 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013837 Py_INCREF(*p);
13838 }
Walter Dörwald16807132007-05-25 13:52:07 +000013839}
13840
13841PyObject *
13842PyUnicode_InternFromString(const char *cp)
13843{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013844 PyObject *s = PyUnicode_FromString(cp);
13845 if (s == NULL)
13846 return NULL;
13847 PyUnicode_InternInPlace(&s);
13848 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013849}
13850
Alexander Belopolsky40018472011-02-26 01:02:56 +000013851void
13852_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013853{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013854 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013855 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013856 Py_ssize_t i, n;
13857 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013858
Benjamin Peterson14339b62009-01-31 16:36:08 +000013859 if (interned == NULL || !PyDict_Check(interned))
13860 return;
13861 keys = PyDict_Keys(interned);
13862 if (keys == NULL || !PyList_Check(keys)) {
13863 PyErr_Clear();
13864 return;
13865 }
Walter Dörwald16807132007-05-25 13:52:07 +000013866
Benjamin Peterson14339b62009-01-31 16:36:08 +000013867 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13868 detector, interned unicode strings are not forcibly deallocated;
13869 rather, we give them their stolen references back, and then clear
13870 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013871
Benjamin Peterson14339b62009-01-31 16:36:08 +000013872 n = PyList_GET_SIZE(keys);
13873 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013874 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013875 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013876 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013877 if (PyUnicode_READY(s) == -1) {
13878 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013879 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013880 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013881 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013882 case SSTATE_NOT_INTERNED:
13883 /* XXX Shouldn't happen */
13884 break;
13885 case SSTATE_INTERNED_IMMORTAL:
13886 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013887 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013888 break;
13889 case SSTATE_INTERNED_MORTAL:
13890 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013891 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013892 break;
13893 default:
13894 Py_FatalError("Inconsistent interned string state.");
13895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013896 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013897 }
13898 fprintf(stderr, "total size of all interned strings: "
13899 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13900 "mortal/immortal\n", mortal_size, immortal_size);
13901 Py_DECREF(keys);
13902 PyDict_Clear(interned);
13903 Py_DECREF(interned);
13904 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013905}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013906
13907
13908/********************* Unicode Iterator **************************/
13909
13910typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013911 PyObject_HEAD
13912 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013913 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013914} unicodeiterobject;
13915
13916static void
13917unicodeiter_dealloc(unicodeiterobject *it)
13918{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013919 _PyObject_GC_UNTRACK(it);
13920 Py_XDECREF(it->it_seq);
13921 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013922}
13923
13924static int
13925unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13926{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013927 Py_VISIT(it->it_seq);
13928 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013929}
13930
13931static PyObject *
13932unicodeiter_next(unicodeiterobject *it)
13933{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013934 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013935
Benjamin Peterson14339b62009-01-31 16:36:08 +000013936 assert(it != NULL);
13937 seq = it->it_seq;
13938 if (seq == NULL)
13939 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013940 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013942 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13943 int kind = PyUnicode_KIND(seq);
13944 void *data = PyUnicode_DATA(seq);
13945 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13946 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013947 if (item != NULL)
13948 ++it->it_index;
13949 return item;
13950 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013951
Benjamin Peterson14339b62009-01-31 16:36:08 +000013952 Py_DECREF(seq);
13953 it->it_seq = NULL;
13954 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013955}
13956
13957static PyObject *
13958unicodeiter_len(unicodeiterobject *it)
13959{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013960 Py_ssize_t len = 0;
13961 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013962 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013963 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013964}
13965
13966PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13967
13968static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013969 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013970 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013971 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013972};
13973
13974PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013975 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13976 "str_iterator", /* tp_name */
13977 sizeof(unicodeiterobject), /* tp_basicsize */
13978 0, /* tp_itemsize */
13979 /* methods */
13980 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13981 0, /* tp_print */
13982 0, /* tp_getattr */
13983 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013984 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013985 0, /* tp_repr */
13986 0, /* tp_as_number */
13987 0, /* tp_as_sequence */
13988 0, /* tp_as_mapping */
13989 0, /* tp_hash */
13990 0, /* tp_call */
13991 0, /* tp_str */
13992 PyObject_GenericGetAttr, /* tp_getattro */
13993 0, /* tp_setattro */
13994 0, /* tp_as_buffer */
13995 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13996 0, /* tp_doc */
13997 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13998 0, /* tp_clear */
13999 0, /* tp_richcompare */
14000 0, /* tp_weaklistoffset */
14001 PyObject_SelfIter, /* tp_iter */
14002 (iternextfunc)unicodeiter_next, /* tp_iternext */
14003 unicodeiter_methods, /* tp_methods */
14004 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014005};
14006
14007static PyObject *
14008unicode_iter(PyObject *seq)
14009{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014010 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014011
Benjamin Peterson14339b62009-01-31 16:36:08 +000014012 if (!PyUnicode_Check(seq)) {
14013 PyErr_BadInternalCall();
14014 return NULL;
14015 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014016 if (PyUnicode_READY(seq) == -1)
14017 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014018 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14019 if (it == NULL)
14020 return NULL;
14021 it->it_index = 0;
14022 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014023 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014024 _PyObject_GC_TRACK(it);
14025 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014026}
14027
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014028
14029size_t
14030Py_UNICODE_strlen(const Py_UNICODE *u)
14031{
14032 int res = 0;
14033 while(*u++)
14034 res++;
14035 return res;
14036}
14037
14038Py_UNICODE*
14039Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14040{
14041 Py_UNICODE *u = s1;
14042 while ((*u++ = *s2++));
14043 return s1;
14044}
14045
14046Py_UNICODE*
14047Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14048{
14049 Py_UNICODE *u = s1;
14050 while ((*u++ = *s2++))
14051 if (n-- == 0)
14052 break;
14053 return s1;
14054}
14055
14056Py_UNICODE*
14057Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14058{
14059 Py_UNICODE *u1 = s1;
14060 u1 += Py_UNICODE_strlen(u1);
14061 Py_UNICODE_strcpy(u1, s2);
14062 return s1;
14063}
14064
14065int
14066Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14067{
14068 while (*s1 && *s2 && *s1 == *s2)
14069 s1++, s2++;
14070 if (*s1 && *s2)
14071 return (*s1 < *s2) ? -1 : +1;
14072 if (*s1)
14073 return 1;
14074 if (*s2)
14075 return -1;
14076 return 0;
14077}
14078
14079int
14080Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14081{
14082 register Py_UNICODE u1, u2;
14083 for (; n != 0; n--) {
14084 u1 = *s1;
14085 u2 = *s2;
14086 if (u1 != u2)
14087 return (u1 < u2) ? -1 : +1;
14088 if (u1 == '\0')
14089 return 0;
14090 s1++;
14091 s2++;
14092 }
14093 return 0;
14094}
14095
14096Py_UNICODE*
14097Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14098{
14099 const Py_UNICODE *p;
14100 for (p = s; *p; p++)
14101 if (*p == c)
14102 return (Py_UNICODE*)p;
14103 return NULL;
14104}
14105
14106Py_UNICODE*
14107Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14108{
14109 const Py_UNICODE *p;
14110 p = s + Py_UNICODE_strlen(s);
14111 while (p != s) {
14112 p--;
14113 if (*p == c)
14114 return (Py_UNICODE*)p;
14115 }
14116 return NULL;
14117}
Victor Stinner331ea922010-08-10 16:37:20 +000014118
Victor Stinner71133ff2010-09-01 23:43:53 +000014119Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014120PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014121{
Victor Stinner577db2c2011-10-11 22:12:48 +020014122 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014123 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014125 if (!PyUnicode_Check(unicode)) {
14126 PyErr_BadArgument();
14127 return NULL;
14128 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014129 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014130 if (u == NULL)
14131 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014132 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014133 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014134 PyErr_NoMemory();
14135 return NULL;
14136 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014137 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014138 size *= sizeof(Py_UNICODE);
14139 copy = PyMem_Malloc(size);
14140 if (copy == NULL) {
14141 PyErr_NoMemory();
14142 return NULL;
14143 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014144 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014145 return copy;
14146}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014147
Georg Brandl66c221e2010-10-14 07:04:07 +000014148/* A _string module, to export formatter_parser and formatter_field_name_split
14149 to the string.Formatter class implemented in Python. */
14150
14151static PyMethodDef _string_methods[] = {
14152 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14153 METH_O, PyDoc_STR("split the argument as a field name")},
14154 {"formatter_parser", (PyCFunction) formatter_parser,
14155 METH_O, PyDoc_STR("parse the argument as a format string")},
14156 {NULL, NULL}
14157};
14158
14159static struct PyModuleDef _string_module = {
14160 PyModuleDef_HEAD_INIT,
14161 "_string",
14162 PyDoc_STR("string helper module"),
14163 0,
14164 _string_methods,
14165 NULL,
14166 NULL,
14167 NULL,
14168 NULL
14169};
14170
14171PyMODINIT_FUNC
14172PyInit__string(void)
14173{
14174 return PyModule_Create(&_string_module);
14175}
14176
14177
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014178#ifdef __cplusplus
14179}
14180#endif