blob: d6a250e30e2a50f69799d20972a0511f40ee1cef [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Byte-order selection: big endian only when the build configuration
   defines WORDS_BIGENDIAN, little endian in every other case. */

#if defined(WORDS_BIGENDIAN)
#  define BYTEORDER_IS_BIG_ENDIAN
#else
#  define BYTEORDER_IS_LITTLE_ENDIAN
#endif
56
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Largest code point of Unicode 6.0: U+10FFFF (1,114,111 in decimal). */
#define MAX_UNICODE 0x10ffff

/* Object check used by the assertions in this file: debug builds run the
   consistency checker (without content check), other builds only run
   PyUnicode_Check(). */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* Field accessors for the string structures.

   The underscore-prefixed macros without assert() are raw lvalue accessors:
   they cast the object to the structure holding the field and perform no
   checks.  The remaining macros assert the object first. */

/* Raw UTF-8 pointer field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 pointer of a ready string: for a compact ASCII string this is the
   memory located right after the PyASCIIObject header, otherwise the utf8
   field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Raw utf8_length field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: for a compact ASCII string this is the
   length field of the PyASCIIObject, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Raw wstr / wstr_length fields. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Raw length / state / hash fields of the PyASCIIObject header. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Checked access to state.kind and length. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Raw data.any pointer field of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* File-local replacement for PyUnicode_READY: after asserting the object,
   it evaluates to 0 when the string is already ready and to the result of
   _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
          0 :                                           \
          _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the UTF-8 pointer is the same pointer as the data buffer
   (i.e. the UTF-8 representation is shared with the data).  Must not be
   used on compact ASCII strings; this is asserted. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer is the same pointer as the data buffer
   (i.e. the wchar_t representation is shared with the data).
   Only the macro parameter is referenced, so the result does not depend
   on a variable named "unicode" being in scope at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* True if the Unicode object owns a separately allocated UTF-8 memory
   block: the string is not compact ASCII, the UTF-8 pointer is set, and
   that pointer differs from the data buffer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separately allocated wstr memory
   block: the wstr pointer is set and either the string is not ready or
   the pointer differs from the data buffer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
143
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each of begin, end and to is evaluated exactly once.  The "to"
   argument is parenthesized before it is cast, so an expression such as
   "buf + offset" is converted as a whole instead of having the cast bind
   to its first operand only.  All locals carry an underscore prefix to
   keep them apart from caller identifiers.

   The bulk of the range is copied four characters per iteration; the
   remaining (at most three) characters are copied one by one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200167
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200168/* The Unicode string has been modified: reset the hash */
169#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
170
Walter Dörwald16807132007-05-25 13:52:07 +0000171/* This dictionary holds all interned unicode strings. Note that references
172 to strings in this dictionary are *not* counted in the string's ob_refcnt.
173 When the interned string reaches a refcnt of 0 the string deallocation
174 function will delete the reference from this dictionary.
175
176 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000177 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000178*/
179static PyObject *interned;
180
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200182static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000183
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200184/* List of static strings. */
185static _Py_Identifier *static_strings;
186
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187/* Single character Unicode strings in the Latin-1 range are being
188 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200189static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code point, 1 marks a whitespace character. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x09 CHARACTER TABULATION
     *   0x0A LINE FEED
     *   0x0B LINE TABULATION
     *   0x0C FORM FEED
     *   0x0D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     *   0x1F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
     *   0x20 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
221
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200223static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200224static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200225static void copy_characters(
226 PyObject *to, Py_ssize_t to_start,
227 PyObject *from, Py_ssize_t from_start,
228 Py_ssize_t how_many);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200231unicode_fromascii(const unsigned char *s, Py_ssize_t size);
232static PyObject *
233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
234static PyObject *
235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
236static PyObject *
237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
238
239static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100242 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100248 PyObject *unicode,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Fast detection of ASCII line breaks: a 128-entry table indexed by ASCII
   code point, 1 marks a line break character. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x0A LINE FEED
     *   0x0B LINE TABULATION
     *   0x0C FORM FEED
     *   0x0D CARRIAGE RETURN
     */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-only validation of the internal invariants of a Unicode object.

   op must be a Unicode object.  The structure fields (kind, compact,
   ascii, ready, wstr, utf8 and their lengths) are checked against each
   other with assert().  When check_content is non-zero and the string is
   not a wchar-kind string, every character is read and the maximum
   character is checked against the range expected for the kind (i.e. the
   smallest suitable kind is in use).

   Always returns 1, so the call itself can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    kind = ascii->state.kind;

    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* kind whose character size equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the structure */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
            else {
                /* non-compact string that is not ready: wstr only */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    else {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t pos = 0;

        while (pos < ascii->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (maxchar < ch)
                maxchar = ch;
            pos++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii != 0) {
                assert(maxchar < 128);
            }
            else {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
    }
    return 1;
}
#endif
405
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100406static PyObject*
407unicode_result_wchar(PyObject *unicode)
408{
409#ifndef Py_DEBUG
410 Py_ssize_t len;
411
412 assert(Py_REFCNT(unicode) == 1);
413
414 len = _PyUnicode_WSTR_LENGTH(unicode);
415 if (len == 0) {
416 Py_INCREF(unicode_empty);
417 Py_DECREF(unicode);
418 return unicode_empty;
419 }
420
421 if (len == 1) {
422 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
423 if (ch < 256) {
424 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
425 Py_DECREF(unicode);
426 return latin1_char;
427 }
428 }
429
430 if (_PyUnicode_Ready(unicode) < 0) {
431 Py_XDECREF(unicode);
432 return NULL;
433 }
434#else
435 /* don't make the result ready in debug mode to ensure that the caller
436 makes the string ready before using it */
437 assert(_PyUnicode_CheckConsistency(unicode, 1));
438#endif
439 return unicode;
440}
441
442static PyObject*
443unicode_result_ready(PyObject *unicode)
444{
445 Py_ssize_t length;
446
447 length = PyUnicode_GET_LENGTH(unicode);
448 if (length == 0) {
449 if (unicode != unicode_empty) {
450 Py_INCREF(unicode_empty);
451 Py_DECREF(unicode);
452 }
453 return unicode_empty;
454 }
455
456 if (length == 1) {
457 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
458 if (ch < 256) {
459 PyObject *latin1_char = unicode_latin1[ch];
460 if (latin1_char != NULL) {
461 if (unicode != latin1_char) {
462 Py_INCREF(latin1_char);
463 Py_DECREF(unicode);
464 }
465 return latin1_char;
466 }
467 else {
468 assert(_PyUnicode_CheckConsistency(unicode, 1));
469 Py_INCREF(unicode);
470 unicode_latin1[ch] = unicode;
471 return unicode;
472 }
473 }
474 }
475
476 assert(_PyUnicode_CheckConsistency(unicode, 1));
477 return unicode;
478}
479
480static PyObject*
481unicode_result(PyObject *unicode)
482{
483 assert(_PyUnicode_CHECK(unicode));
484 if (PyUnicode_IS_READY(unicode))
485 return unicode_result_ready(unicode);
486 else
487 return unicode_result_wchar(unicode);
488}
489
/* Windows version information; only present when HAVE_MBCS is defined. */
#if defined(HAVE_MBCS)
static OSVERSIONINFOEX winver;
#endif
493
Thomas Wouters477c8d52006-05-27 19:21:47 +0000494/* --- Bloom Filters ----------------------------------------------------- */
495
496/* stuff to implement simple "bloom filters" for Unicode characters.
497 to keep things simple, we use a single bitmask, using the least 5
498 bits from each unicode characters as the bit index. */
499
500/* the linebreak mask is set up by Unicode_Init below */
501
Antoine Pitrouf068f942010-01-13 14:19:12 +0000502#if LONG_BIT >= 128
503#define BLOOM_WIDTH 128
504#elif LONG_BIT >= 64
505#define BLOOM_WIDTH 64
506#elif LONG_BIT >= 32
507#define BLOOM_WIDTH 32
508#else
509#error "LONG_BIT is smaller than 32"
510#endif
511
Thomas Wouters477c8d52006-05-27 19:21:47 +0000512#define BLOOM_MASK unsigned long
513
514static BLOOM_MASK bloom_linebreak;
515
Antoine Pitrouf068f942010-01-13 14:19:12 +0000516#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
517#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000518
Benjamin Peterson29060642009-01-31 22:14:21 +0000519#define BLOOM_LINEBREAK(ch) \
520 ((ch) < 128U ? ascii_linebreak[(ch)] : \
521 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000522
Alexander Belopolsky40018472011-02-26 01:02:56 +0000523Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200524make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000525{
526 /* calculate simple bloom-style bitmask for a given unicode string */
527
Antoine Pitrouf068f942010-01-13 14:19:12 +0000528 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000529 Py_ssize_t i;
530
531 mask = 0;
532 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200533 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000534
535 return mask;
536}
537
/* Membership test of chr in str: the bloom mask is consulted first and
   PyUnicode_FindChar() over the whole string is only called when the
   mask bit is set. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000541
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200542/* Compilation of templated routines */
543
544#include "stringlib/asciilib.h"
545#include "stringlib/fastsearch.h"
546#include "stringlib/partition.h"
547#include "stringlib/split.h"
548#include "stringlib/count.h"
549#include "stringlib/find.h"
550#include "stringlib/find_max_char.h"
551#include "stringlib/localeutil.h"
552#include "stringlib/undef.h"
553
554#include "stringlib/ucs1lib.h"
555#include "stringlib/fastsearch.h"
556#include "stringlib/partition.h"
557#include "stringlib/split.h"
558#include "stringlib/count.h"
559#include "stringlib/find.h"
560#include "stringlib/find_max_char.h"
561#include "stringlib/localeutil.h"
562#include "stringlib/undef.h"
563
564#include "stringlib/ucs2lib.h"
565#include "stringlib/fastsearch.h"
566#include "stringlib/partition.h"
567#include "stringlib/split.h"
568#include "stringlib/count.h"
569#include "stringlib/find.h"
570#include "stringlib/find_max_char.h"
571#include "stringlib/localeutil.h"
572#include "stringlib/undef.h"
573
574#include "stringlib/ucs4lib.h"
575#include "stringlib/fastsearch.h"
576#include "stringlib/partition.h"
577#include "stringlib/split.h"
578#include "stringlib/count.h"
579#include "stringlib/find.h"
580#include "stringlib/find_max_char.h"
581#include "stringlib/localeutil.h"
582#include "stringlib/undef.h"
583
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200584#include "stringlib/unicodedefs.h"
585#include "stringlib/fastsearch.h"
586#include "stringlib/count.h"
587#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100588#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200589
Guido van Rossumd57fd912000-03-10 22:53:23 +0000590/* --- Unicode Object ----------------------------------------------------- */
591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200592static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200593fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200594
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200595Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
596 Py_ssize_t size, Py_UCS4 ch,
597 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200598{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200599 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
600
601 switch (kind) {
602 case PyUnicode_1BYTE_KIND:
603 {
604 Py_UCS1 ch1 = (Py_UCS1) ch;
605 if (ch1 == ch)
606 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
607 else
608 return -1;
609 }
610 case PyUnicode_2BYTE_KIND:
611 {
612 Py_UCS2 ch2 = (Py_UCS2) ch;
613 if (ch2 == ch)
614 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
615 else
616 return -1;
617 }
618 case PyUnicode_4BYTE_KIND:
619 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
620 default:
621 assert(0);
622 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200624}
625
Victor Stinnerfe226c02011-10-03 03:52:20 +0200626static PyObject*
627resize_compact(PyObject *unicode, Py_ssize_t length)
628{
629 Py_ssize_t char_size;
630 Py_ssize_t struct_size;
631 Py_ssize_t new_size;
632 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100633 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200634
635 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200636 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200637 if (PyUnicode_IS_COMPACT_ASCII(unicode))
638 struct_size = sizeof(PyASCIIObject);
639 else
640 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200641 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200642
Victor Stinnerfe226c02011-10-03 03:52:20 +0200643 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
Victor Stinner84def372011-12-11 20:04:56 +0100644 Py_DECREF(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200645 PyErr_NoMemory();
646 return NULL;
647 }
648 new_size = (struct_size + (length + 1) * char_size);
649
Victor Stinner84def372011-12-11 20:04:56 +0100650 _Py_DEC_REFTOTAL;
651 _Py_ForgetReference(unicode);
652
653 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
654 if (new_unicode == NULL) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200655 PyObject_Del(unicode);
656 PyErr_NoMemory();
657 return NULL;
658 }
Victor Stinner84def372011-12-11 20:04:56 +0100659 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200660 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100661
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200663 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200664 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200665 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
666 _PyUnicode_WSTR_LENGTH(unicode) = length;
667 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200668 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
669 length, 0);
670 return unicode;
671}
672
Alexander Belopolsky40018472011-02-26 01:02:56 +0000673static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200674resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000675{
Victor Stinner95663112011-10-04 01:03:50 +0200676 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000679
Victor Stinner95663112011-10-04 01:03:50 +0200680 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681
682 if (PyUnicode_IS_READY(unicode)) {
683 Py_ssize_t char_size;
684 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200685 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200686 void *data;
687
688 data = _PyUnicode_DATA_ANY(unicode);
689 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200690 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200691 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
692 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200693 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
694 {
695 PyObject_DEL(_PyUnicode_UTF8(unicode));
696 _PyUnicode_UTF8(unicode) = NULL;
697 _PyUnicode_UTF8_LENGTH(unicode) = 0;
698 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200699
700 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
701 PyErr_NoMemory();
702 return -1;
703 }
704 new_size = (length + 1) * char_size;
705
706 data = (PyObject *)PyObject_REALLOC(data, new_size);
707 if (data == NULL) {
708 PyErr_NoMemory();
709 return -1;
710 }
711 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200712 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200713 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200714 _PyUnicode_WSTR_LENGTH(unicode) = length;
715 }
716 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200717 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200718 _PyUnicode_UTF8_LENGTH(unicode) = length;
719 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200720 _PyUnicode_LENGTH(unicode) = length;
721 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200722 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200723 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200724 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200726 }
Victor Stinner95663112011-10-04 01:03:50 +0200727 assert(_PyUnicode_WSTR(unicode) != NULL);
728
729 /* check for integer overflow */
730 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
731 PyErr_NoMemory();
732 return -1;
733 }
734 wstr = _PyUnicode_WSTR(unicode);
735 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
736 if (!wstr) {
737 PyErr_NoMemory();
738 return -1;
739 }
740 _PyUnicode_WSTR(unicode) = wstr;
741 _PyUnicode_WSTR(unicode)[length] = 0;
742 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200743 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000744 return 0;
745}
746
Victor Stinnerfe226c02011-10-03 03:52:20 +0200747static PyObject*
748resize_copy(PyObject *unicode, Py_ssize_t length)
749{
750 Py_ssize_t copy_length;
751 if (PyUnicode_IS_COMPACT(unicode)) {
752 PyObject *copy;
753 assert(PyUnicode_IS_READY(unicode));
754
755 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
756 if (copy == NULL)
757 return NULL;
758
759 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200760 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200761 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200762 }
763 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200764 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 assert(_PyUnicode_WSTR(unicode) != NULL);
766 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200767 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200768 if (w == NULL)
769 return NULL;
770 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
771 copy_length = Py_MIN(copy_length, length);
772 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
773 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200774 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200775 }
776}
777
Guido van Rossumd57fd912000-03-10 22:53:23 +0000778/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000779 Ux0000 terminated; some code (e.g. new_identifier)
780 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000781
782 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000783 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000784
785*/
786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200787#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200788static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200789#endif
790
Alexander Belopolsky40018472011-02-26 01:02:56 +0000791static PyUnicodeObject *
792_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000793{
794 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200795 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000796
Thomas Wouters477c8d52006-05-27 19:21:47 +0000797 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000798 if (length == 0 && unicode_empty != NULL) {
799 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200800 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000801 }
802
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000803 /* Ensure we won't overflow the size. */
804 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
805 return (PyUnicodeObject *)PyErr_NoMemory();
806 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200807 if (length < 0) {
808 PyErr_SetString(PyExc_SystemError,
809 "Negative size passed to _PyUnicode_New");
810 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811 }
812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200813#ifdef Py_DEBUG
814 ++unicode_old_new_calls;
815#endif
816
817 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
818 if (unicode == NULL)
819 return NULL;
820 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
821 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
822 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000823 PyErr_NoMemory();
824 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000825 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826
Jeremy Hyltond8082792003-09-16 19:41:39 +0000827 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000828 * the caller fails before initializing str -- unicode_resize()
829 * reads str[0], and the Keep-Alive optimization can keep memory
830 * allocated for str alive across a call to unicode_dealloc(unicode).
831 * We don't want unicode_resize to read uninitialized memory in
832 * that case.
833 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200834 _PyUnicode_WSTR(unicode)[0] = 0;
835 _PyUnicode_WSTR(unicode)[length] = 0;
836 _PyUnicode_WSTR_LENGTH(unicode) = length;
837 _PyUnicode_HASH(unicode) = -1;
838 _PyUnicode_STATE(unicode).interned = 0;
839 _PyUnicode_STATE(unicode).kind = 0;
840 _PyUnicode_STATE(unicode).compact = 0;
841 _PyUnicode_STATE(unicode).ready = 0;
842 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200843 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200844 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200845 _PyUnicode_UTF8(unicode) = NULL;
846 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100847 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000848 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000849
Benjamin Peterson29060642009-01-31 22:14:21 +0000850 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000851 /* XXX UNREF/NEWREF interface should be more symmetrical */
852 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000853 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000854 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000855 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000856}
857
Victor Stinnerf42dc442011-10-02 23:33:16 +0200858static const char*
859unicode_kind_name(PyObject *unicode)
860{
Victor Stinner42dfd712011-10-03 14:41:45 +0200861 /* don't check consistency: unicode_kind_name() is called from
862 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200863 if (!PyUnicode_IS_COMPACT(unicode))
864 {
865 if (!PyUnicode_IS_READY(unicode))
866 return "wstr";
867 switch(PyUnicode_KIND(unicode))
868 {
869 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200870 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200871 return "legacy ascii";
872 else
873 return "legacy latin1";
874 case PyUnicode_2BYTE_KIND:
875 return "legacy UCS2";
876 case PyUnicode_4BYTE_KIND:
877 return "legacy UCS4";
878 default:
879 return "<legacy invalid kind>";
880 }
881 }
882 assert(PyUnicode_IS_READY(unicode));
883 switch(PyUnicode_KIND(unicode))
884 {
885 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200886 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200887 return "ascii";
888 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200889 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200890 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200891 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 default:
895 return "<invalid compact kind>";
896 }
897}
898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200899#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200900static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200901
902/* Functions wrapping macros for use in debugger */
903char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200904 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200905}
906
907void *_PyUnicode_compact_data(void *unicode) {
908 return _PyUnicode_COMPACT_DATA(unicode);
909}
910void *_PyUnicode_data(void *unicode){
911 printf("obj %p\n", unicode);
912 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
913 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
914 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
915 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
916 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
917 return PyUnicode_DATA(unicode);
918}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200919
920void
921_PyUnicode_Dump(PyObject *op)
922{
923 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200924 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
925 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
926 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200927
Victor Stinnera849a4b2011-10-03 12:12:11 +0200928 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200929 {
930 if (ascii->state.ascii)
931 data = (ascii + 1);
932 else
933 data = (compact + 1);
934 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200935 else
936 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200937 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
938
Victor Stinnera849a4b2011-10-03 12:12:11 +0200939 if (ascii->wstr == data)
940 printf("shared ");
941 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200942
Victor Stinnera3b334d2011-10-03 13:53:37 +0200943 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200944 printf(" (%zu), ", compact->wstr_length);
945 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
946 printf("shared ");
947 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200948 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200949 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200950}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200951#endif
952
953PyObject *
954PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
955{
956 PyObject *obj;
957 PyCompactUnicodeObject *unicode;
958 void *data;
959 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200960 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200961 Py_ssize_t char_size;
962 Py_ssize_t struct_size;
963
964 /* Optimization for empty strings */
965 if (size == 0 && unicode_empty != NULL) {
966 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200967 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 }
969
970#ifdef Py_DEBUG
971 ++unicode_new_new_calls;
972#endif
973
Victor Stinner9e9d6892011-10-04 01:02:02 +0200974 is_ascii = 0;
975 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976 struct_size = sizeof(PyCompactUnicodeObject);
977 if (maxchar < 128) {
978 kind_state = PyUnicode_1BYTE_KIND;
979 char_size = 1;
980 is_ascii = 1;
981 struct_size = sizeof(PyASCIIObject);
982 }
983 else if (maxchar < 256) {
984 kind_state = PyUnicode_1BYTE_KIND;
985 char_size = 1;
986 }
987 else if (maxchar < 65536) {
988 kind_state = PyUnicode_2BYTE_KIND;
989 char_size = 2;
990 if (sizeof(wchar_t) == 2)
991 is_sharing = 1;
992 }
993 else {
994 kind_state = PyUnicode_4BYTE_KIND;
995 char_size = 4;
996 if (sizeof(wchar_t) == 4)
997 is_sharing = 1;
998 }
999
1000 /* Ensure we won't overflow the size. */
1001 if (size < 0) {
1002 PyErr_SetString(PyExc_SystemError,
1003 "Negative size passed to PyUnicode_New");
1004 return NULL;
1005 }
1006 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1007 return PyErr_NoMemory();
1008
1009 /* Duplicated allocation code from _PyObject_New() instead of a call to
1010 * PyObject_New() so we are able to allocate space for the object and
1011 * it's data buffer.
1012 */
1013 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1014 if (obj == NULL)
1015 return PyErr_NoMemory();
1016 obj = PyObject_INIT(obj, &PyUnicode_Type);
1017 if (obj == NULL)
1018 return NULL;
1019
1020 unicode = (PyCompactUnicodeObject *)obj;
1021 if (is_ascii)
1022 data = ((PyASCIIObject*)obj) + 1;
1023 else
1024 data = unicode + 1;
1025 _PyUnicode_LENGTH(unicode) = size;
1026 _PyUnicode_HASH(unicode) = -1;
1027 _PyUnicode_STATE(unicode).interned = 0;
1028 _PyUnicode_STATE(unicode).kind = kind_state;
1029 _PyUnicode_STATE(unicode).compact = 1;
1030 _PyUnicode_STATE(unicode).ready = 1;
1031 _PyUnicode_STATE(unicode).ascii = is_ascii;
1032 if (is_ascii) {
1033 ((char*)data)[size] = 0;
1034 _PyUnicode_WSTR(unicode) = NULL;
1035 }
1036 else if (kind_state == PyUnicode_1BYTE_KIND) {
1037 ((char*)data)[size] = 0;
1038 _PyUnicode_WSTR(unicode) = NULL;
1039 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001040 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001041 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001042 }
1043 else {
1044 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001045 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001046 if (kind_state == PyUnicode_2BYTE_KIND)
1047 ((Py_UCS2*)data)[size] = 0;
1048 else /* kind_state == PyUnicode_4BYTE_KIND */
1049 ((Py_UCS4*)data)[size] = 0;
1050 if (is_sharing) {
1051 _PyUnicode_WSTR_LENGTH(unicode) = size;
1052 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1053 }
1054 else {
1055 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1056 _PyUnicode_WSTR(unicode) = NULL;
1057 }
1058 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001059 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 return obj;
1061}
1062
1063#if SIZEOF_WCHAR_T == 2
1064/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1065 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001066 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067
1068 This function assumes that unicode can hold one more code point than wstr
1069 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001070static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001071unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001072 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001073{
1074 const wchar_t *iter;
1075 Py_UCS4 *ucs4_out;
1076
Victor Stinner910337b2011-10-03 03:20:16 +02001077 assert(unicode != NULL);
1078 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001079 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1080 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1081
1082 for (iter = begin; iter < end; ) {
1083 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1084 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001085 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1086 && (iter+1) < end
1087 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 {
Victor Stinner551ac952011-11-29 22:58:13 +01001089 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001090 iter += 2;
1091 }
1092 else {
1093 *ucs4_out++ = *iter;
1094 iter++;
1095 }
1096 }
1097 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1098 _PyUnicode_GET_LENGTH(unicode)));
1099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001100}
1101#endif
1102
Victor Stinnercd9950f2011-10-02 00:34:53 +02001103static int
1104_PyUnicode_Dirty(PyObject *unicode)
1105{
Victor Stinner910337b2011-10-03 03:20:16 +02001106 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001107 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001108 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001109 "Cannot modify a string having more than 1 reference");
1110 return -1;
1111 }
1112 _PyUnicode_DIRTY(unicode);
1113 return 0;
1114}
1115
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001116static int
1117_copy_characters(PyObject *to, Py_ssize_t to_start,
1118 PyObject *from, Py_ssize_t from_start,
1119 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001121 unsigned int from_kind, to_kind;
1122 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001123 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001124
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001125 assert(PyUnicode_Check(from));
1126 assert(PyUnicode_Check(to));
1127 assert(PyUnicode_IS_READY(from));
1128 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1131 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1132 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001133
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001134 if (how_many == 0)
1135 return 0;
1136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001137 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001138 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001139 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001140 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001142#ifdef Py_DEBUG
1143 if (!check_maxchar
1144 && (from_kind > to_kind
1145 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001146 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1148 Py_UCS4 ch;
1149 Py_ssize_t i;
1150 for (i=0; i < how_many; i++) {
1151 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1152 assert(ch <= to_maxchar);
1153 }
1154 }
1155#endif
1156 fast = (from_kind == to_kind);
1157 if (check_maxchar
1158 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1159 {
1160 /* deny latin1 => ascii */
1161 fast = 0;
1162 }
1163
1164 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001165 Py_MEMCPY((char*)to_data + to_kind * to_start,
1166 (char*)from_data + from_kind * from_start,
1167 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001168 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001169 else if (from_kind == PyUnicode_1BYTE_KIND
1170 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001171 {
1172 _PyUnicode_CONVERT_BYTES(
1173 Py_UCS1, Py_UCS2,
1174 PyUnicode_1BYTE_DATA(from) + from_start,
1175 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1176 PyUnicode_2BYTE_DATA(to) + to_start
1177 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001178 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001179 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001180 && to_kind == PyUnicode_4BYTE_KIND)
1181 {
1182 _PyUnicode_CONVERT_BYTES(
1183 Py_UCS1, Py_UCS4,
1184 PyUnicode_1BYTE_DATA(from) + from_start,
1185 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1186 PyUnicode_4BYTE_DATA(to) + to_start
1187 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001188 }
1189 else if (from_kind == PyUnicode_2BYTE_KIND
1190 && to_kind == PyUnicode_4BYTE_KIND)
1191 {
1192 _PyUnicode_CONVERT_BYTES(
1193 Py_UCS2, Py_UCS4,
1194 PyUnicode_2BYTE_DATA(from) + from_start,
1195 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1196 PyUnicode_4BYTE_DATA(to) + to_start
1197 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001198 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001199 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001200 /* check if max_char(from substring) <= max_char(to) */
1201 if (from_kind > to_kind
1202 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001203 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001204 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001205 /* slow path to check for character overflow */
1206 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001207 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001208 Py_ssize_t i;
1209
Victor Stinner56c161a2011-10-06 02:47:11 +02001210#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001211 for (i=0; i < how_many; i++) {
1212 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001213 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001214 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1215 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001216#else
1217 if (!check_maxchar) {
1218 for (i=0; i < how_many; i++) {
1219 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1220 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1221 }
1222 }
1223 else {
1224 for (i=0; i < how_many; i++) {
1225 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1226 if (ch > to_maxchar)
1227 return 1;
1228 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1229 }
1230 }
1231#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001232 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001233 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001234 assert(0 && "inconsistent state");
1235 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001236 }
1237 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001238 return 0;
1239}
1240
1241static void
1242copy_characters(PyObject *to, Py_ssize_t to_start,
1243 PyObject *from, Py_ssize_t from_start,
1244 Py_ssize_t how_many)
1245{
1246 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1247}
1248
1249Py_ssize_t
1250PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1251 PyObject *from, Py_ssize_t from_start,
1252 Py_ssize_t how_many)
1253{
1254 int err;
1255
1256 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1257 PyErr_BadInternalCall();
1258 return -1;
1259 }
1260
1261 if (PyUnicode_READY(from))
1262 return -1;
1263 if (PyUnicode_READY(to))
1264 return -1;
1265
1266 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1267 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1268 PyErr_Format(PyExc_SystemError,
1269 "Cannot write %zi characters at %zi "
1270 "in a string of %zi characters",
1271 how_many, to_start, PyUnicode_GET_LENGTH(to));
1272 return -1;
1273 }
1274
1275 if (how_many == 0)
1276 return 0;
1277
1278 if (_PyUnicode_Dirty(to))
1279 return -1;
1280
1281 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1282 if (err) {
1283 PyErr_Format(PyExc_SystemError,
1284 "Cannot copy %s characters "
1285 "into a string of %s characters",
1286 unicode_kind_name(from),
1287 unicode_kind_name(to));
1288 return -1;
1289 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001290 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001291}
1292
Victor Stinner17222162011-09-28 22:15:37 +02001293/* Find the maximum code point and count the number of surrogate pairs so a
1294 correct string length can be computed before converting a string to UCS4.
1295 This function counts single surrogates as a character and not as a pair.
1296
1297 Return 0 on success, or -1 on error. */
1298static int
1299find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1300 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001301{
1302 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001303 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304
Victor Stinnerc53be962011-10-02 21:33:54 +02001305 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306 *num_surrogates = 0;
1307 *maxchar = 0;
1308
1309 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001311 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1312 && (iter+1) < end
1313 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001315 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 iter += 2;
1318 }
1319 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001321 {
1322 ch = *iter;
1323 iter++;
1324 }
1325 if (ch > *maxchar) {
1326 *maxchar = ch;
1327 if (*maxchar > MAX_UNICODE) {
1328 PyErr_Format(PyExc_ValueError,
1329 "character U+%x is not in range [U+0000; U+10ffff]",
1330 ch);
1331 return -1;
1332 }
1333 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334 }
1335 return 0;
1336}
1337
1338#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001339static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001340#endif
1341
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001342int
1343_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344{
1345 wchar_t *end;
1346 Py_UCS4 maxchar = 0;
1347 Py_ssize_t num_surrogates;
1348#if SIZEOF_WCHAR_T == 2
1349 Py_ssize_t length_wo_surrogates;
1350#endif
1351
Georg Brandl7597add2011-10-05 16:36:47 +02001352 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001353 strings were created using _PyObject_New() and where no canonical
1354 representation (the str field) has been set yet aka strings
1355 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001356 assert(_PyUnicode_CHECK(unicode));
1357 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001358 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001359 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001360 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001361 /* Actually, it should neither be interned nor be anything else: */
1362 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363
1364#ifdef Py_DEBUG
1365 ++unicode_ready_calls;
1366#endif
1367
1368 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001369 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001370 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372
1373 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001374 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1375 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376 PyErr_NoMemory();
1377 return -1;
1378 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001379 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001380 _PyUnicode_WSTR(unicode), end,
1381 PyUnicode_1BYTE_DATA(unicode));
1382 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1383 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1384 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1385 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001386 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001387 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001388 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389 }
1390 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001391 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001392 _PyUnicode_UTF8(unicode) = NULL;
1393 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 }
1395 PyObject_FREE(_PyUnicode_WSTR(unicode));
1396 _PyUnicode_WSTR(unicode) = NULL;
1397 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1398 }
1399 /* In this case we might have to convert down from 4-byte native
1400 wchar_t to 2-byte unicode. */
1401 else if (maxchar < 65536) {
1402 assert(num_surrogates == 0 &&
1403 "FindMaxCharAndNumSurrogatePairs() messed up");
1404
Victor Stinner506f5922011-09-28 22:34:18 +02001405#if SIZEOF_WCHAR_T == 2
1406 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001407 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001408 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1409 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1410 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001411 _PyUnicode_UTF8(unicode) = NULL;
1412 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001413#else
1414 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001415 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001416 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001417 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001418 PyErr_NoMemory();
1419 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420 }
Victor Stinner506f5922011-09-28 22:34:18 +02001421 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1422 _PyUnicode_WSTR(unicode), end,
1423 PyUnicode_2BYTE_DATA(unicode));
1424 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1425 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1426 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001427 _PyUnicode_UTF8(unicode) = NULL;
1428 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001429 PyObject_FREE(_PyUnicode_WSTR(unicode));
1430 _PyUnicode_WSTR(unicode) = NULL;
1431 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1432#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 }
1434 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1435 else {
1436#if SIZEOF_WCHAR_T == 2
1437 /* in case the native representation is 2-bytes, we need to allocate a
1438 new normalized 4-byte version. */
1439 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001440 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1441 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001442 PyErr_NoMemory();
1443 return -1;
1444 }
1445 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1446 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001447 _PyUnicode_UTF8(unicode) = NULL;
1448 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001449 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1450 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001451 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452 PyObject_FREE(_PyUnicode_WSTR(unicode));
1453 _PyUnicode_WSTR(unicode) = NULL;
1454 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1455#else
1456 assert(num_surrogates == 0);
1457
Victor Stinnerc3c74152011-10-02 20:39:55 +02001458 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001459 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001460 _PyUnicode_UTF8(unicode) = NULL;
1461 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1463#endif
1464 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1465 }
1466 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001467 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 return 0;
1469}
1470
Alexander Belopolsky40018472011-02-26 01:02:56 +00001471static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001472unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001473{
Walter Dörwald16807132007-05-25 13:52:07 +00001474 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001475 case SSTATE_NOT_INTERNED:
1476 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001477
Benjamin Peterson29060642009-01-31 22:14:21 +00001478 case SSTATE_INTERNED_MORTAL:
1479 /* revive dead object temporarily for DelItem */
1480 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001481 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001482 Py_FatalError(
1483 "deletion of interned string failed");
1484 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001485
Benjamin Peterson29060642009-01-31 22:14:21 +00001486 case SSTATE_INTERNED_IMMORTAL:
1487 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001488
Benjamin Peterson29060642009-01-31 22:14:21 +00001489 default:
1490 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001491 }
1492
Victor Stinner03490912011-10-03 23:45:12 +02001493 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001494 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001495 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001496 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497
1498 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001499 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500 }
1501 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001502 if (_PyUnicode_DATA_ANY(unicode))
1503 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001504 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505 }
1506}
1507
#ifdef Py_DEBUG
/* Debug-only predicate: return 1 if `unicode` is the shared empty string or
   one of the cached one-character strings stored in unicode_latin1[],
   0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;
    /* Only a one-character string that is not of wchar kind can be a
       cached Latin-1 singleton. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1524
Alexander Belopolsky40018472011-02-26 01:02:56 +00001525static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001526unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001527{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001528 if (Py_REFCNT(unicode) != 1)
1529 return 0;
1530 if (PyUnicode_CHECK_INTERNED(unicode))
1531 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001532#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001533 /* singleton refcount is greater than 1 */
1534 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001535#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001536 return 1;
1537}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001538
Victor Stinnerfe226c02011-10-03 03:52:20 +02001539static int
1540unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1541{
1542 PyObject *unicode;
1543 Py_ssize_t old_length;
1544
1545 assert(p_unicode != NULL);
1546 unicode = *p_unicode;
1547
1548 assert(unicode != NULL);
1549 assert(PyUnicode_Check(unicode));
1550 assert(0 <= length);
1551
Victor Stinner910337b2011-10-03 03:20:16 +02001552 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001553 old_length = PyUnicode_WSTR_LENGTH(unicode);
1554 else
1555 old_length = PyUnicode_GET_LENGTH(unicode);
1556 if (old_length == length)
1557 return 0;
1558
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001559 if (length == 0) {
1560 Py_DECREF(*p_unicode);
1561 *p_unicode = unicode_empty;
1562 Py_INCREF(*p_unicode);
1563 return 0;
1564 }
1565
Victor Stinnerfe226c02011-10-03 03:52:20 +02001566 if (!unicode_resizable(unicode)) {
1567 PyObject *copy = resize_copy(unicode, length);
1568 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001569 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001570 Py_DECREF(*p_unicode);
1571 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001572 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001573 }
1574
Victor Stinnerfe226c02011-10-03 03:52:20 +02001575 if (PyUnicode_IS_COMPACT(unicode)) {
1576 *p_unicode = resize_compact(unicode, length);
1577 if (*p_unicode == NULL)
1578 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001579 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001580 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001581 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001582 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001583}
1584
Alexander Belopolsky40018472011-02-26 01:02:56 +00001585int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001586PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001587{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001588 PyObject *unicode;
1589 if (p_unicode == NULL) {
1590 PyErr_BadInternalCall();
1591 return -1;
1592 }
1593 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001594 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001595 {
1596 PyErr_BadInternalCall();
1597 return -1;
1598 }
1599 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001600}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001601
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001602static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001603unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001604{
1605 PyObject *result;
1606 assert(PyUnicode_IS_READY(*p_unicode));
1607 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1608 return 0;
1609 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1610 maxchar);
1611 if (result == NULL)
1612 return -1;
1613 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1614 PyUnicode_GET_LENGTH(*p_unicode));
1615 Py_DECREF(*p_unicode);
1616 *p_unicode = result;
1617 return 0;
1618}
1619
1620static int
1621unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1622 Py_UCS4 ch)
1623{
1624 if (unicode_widen(p_unicode, ch) < 0)
1625 return -1;
1626 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1627 PyUnicode_DATA(*p_unicode),
1628 (*pos)++, ch);
1629 return 0;
1630}
1631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632static PyObject*
1633get_latin1_char(unsigned char ch)
1634{
Victor Stinnera464fc12011-10-02 20:39:30 +02001635 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001637 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638 if (!unicode)
1639 return NULL;
1640 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001641 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 unicode_latin1[ch] = unicode;
1643 }
1644 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001645 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646}
1647
Alexander Belopolsky40018472011-02-26 01:02:56 +00001648PyObject *
1649PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001650{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001651 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652 Py_UCS4 maxchar = 0;
1653 Py_ssize_t num_surrogates;
1654
1655 if (u == NULL)
1656 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001658 /* If the Unicode data is known at construction time, we can apply
1659 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661 /* Optimization for empty strings */
1662 if (size == 0 && unicode_empty != NULL) {
1663 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001664 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001665 }
Tim Petersced69f82003-09-16 20:30:58 +00001666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 /* Single character Unicode objects in the Latin-1 range are
1668 shared when using this constructor */
1669 if (size == 1 && *u < 256)
1670 return get_latin1_char((unsigned char)*u);
1671
1672 /* If not empty and not single character, copy the Unicode data
1673 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001674 if (find_maxchar_surrogates(u, u + size,
1675 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676 return NULL;
1677
Victor Stinner8faf8212011-12-08 22:14:11 +01001678 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679 if (!unicode)
1680 return NULL;
1681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682 switch (PyUnicode_KIND(unicode)) {
1683 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001684 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1686 break;
1687 case PyUnicode_2BYTE_KIND:
1688#if Py_UNICODE_SIZE == 2
1689 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1690#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001691 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001692 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1693#endif
1694 break;
1695 case PyUnicode_4BYTE_KIND:
1696#if SIZEOF_WCHAR_T == 2
1697 /* This is the only case which has to process surrogates, thus
1698 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001699 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700#else
1701 assert(num_surrogates == 0);
1702 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1703#endif
1704 break;
1705 default:
1706 assert(0 && "Impossible state");
1707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001709 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710}
1711
Alexander Belopolsky40018472011-02-26 01:02:56 +00001712PyObject *
1713PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001714{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001715 if (size < 0) {
1716 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001717 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001718 return NULL;
1719 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001720 if (u != NULL)
1721 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1722 else
1723 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001724}
1725
Alexander Belopolsky40018472011-02-26 01:02:56 +00001726PyObject *
1727PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001728{
1729 size_t size = strlen(u);
1730 if (size > PY_SSIZE_T_MAX) {
1731 PyErr_SetString(PyExc_OverflowError, "input too long");
1732 return NULL;
1733 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001734 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001735}
1736
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001737PyObject *
1738_PyUnicode_FromId(_Py_Identifier *id)
1739{
1740 if (!id->object) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001741 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1742 strlen(id->string),
1743 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001744 if (!id->object)
1745 return NULL;
1746 PyUnicode_InternInPlace(&id->object);
1747 assert(!id->next);
1748 id->next = static_strings;
1749 static_strings = id;
1750 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001751 return id->object;
1752}
1753
1754void
1755_PyUnicode_ClearStaticStrings()
1756{
1757 _Py_Identifier *i;
1758 for (i = static_strings; i; i = i->next) {
1759 Py_DECREF(i->object);
1760 i->object = NULL;
1761 i->next = NULL;
1762 }
1763}
1764
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001765/* Internal function, don't check maximum character */
1766
Victor Stinnere57b1c02011-09-28 22:20:48 +02001767static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001768unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001769{
Victor Stinner785938e2011-12-11 20:09:03 +01001770 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001771 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001772#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001773 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001774#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001775 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001776 }
Victor Stinner785938e2011-12-11 20:09:03 +01001777 unicode = PyUnicode_New(size, 127);
1778 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001779 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001780 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1781 assert(_PyUnicode_CheckConsistency(unicode, 1));
1782 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001783}
1784
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001785static Py_UCS4
1786kind_maxchar_limit(unsigned int kind)
1787{
1788 switch(kind) {
1789 case PyUnicode_1BYTE_KIND:
1790 return 0x80;
1791 case PyUnicode_2BYTE_KIND:
1792 return 0x100;
1793 case PyUnicode_4BYTE_KIND:
1794 return 0x10000;
1795 default:
1796 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001797 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001798 }
1799}
1800
Victor Stinner702c7342011-10-05 13:50:52 +02001801static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001802_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001804 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001805 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001806
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001807 if (size == 0) {
1808 Py_INCREF(unicode_empty);
1809 return unicode_empty;
1810 }
1811 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001812 if (size == 1)
1813 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001814
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001815 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001816 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817 if (!res)
1818 return NULL;
1819 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001820 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001821 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001822}
1823
Victor Stinnere57b1c02011-09-28 22:20:48 +02001824static PyObject*
1825_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826{
1827 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001828 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001829
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001830 if (size == 0) {
1831 Py_INCREF(unicode_empty);
1832 return unicode_empty;
1833 }
1834 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001835 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001836 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001837
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001838 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001839 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 if (!res)
1841 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001842 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001843 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001844 else {
1845 _PyUnicode_CONVERT_BYTES(
1846 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1847 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001848 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001849 return res;
1850}
1851
Victor Stinnere57b1c02011-09-28 22:20:48 +02001852static PyObject*
1853_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001854{
1855 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001856 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001857
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001858 if (size == 0) {
1859 Py_INCREF(unicode_empty);
1860 return unicode_empty;
1861 }
1862 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001863 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001864 return get_latin1_char((unsigned char)u[0]);
1865
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001866 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001867 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001868 if (!res)
1869 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001870 if (max_char < 256)
1871 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1872 PyUnicode_1BYTE_DATA(res));
1873 else if (max_char < 0x10000)
1874 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1875 PyUnicode_2BYTE_DATA(res));
1876 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001877 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001878 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001879 return res;
1880}
1881
1882PyObject*
1883PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1884{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001885 if (size < 0) {
1886 PyErr_SetString(PyExc_ValueError, "size must be positive");
1887 return NULL;
1888 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001889 switch(kind) {
1890 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001891 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001893 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001894 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001895 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001896 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001897 PyErr_SetString(PyExc_SystemError, "invalid kind");
1898 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001899 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001900}
1901
Victor Stinner25a4b292011-10-06 12:31:55 +02001902/* Ensure that a string uses the most efficient storage, if it is not the
1903 case: create a new string with of the right kind. Write NULL into *p_unicode
1904 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001905static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001906unicode_adjust_maxchar(PyObject **p_unicode)
1907{
1908 PyObject *unicode, *copy;
1909 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001910 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001911 unsigned int kind;
1912
1913 assert(p_unicode != NULL);
1914 unicode = *p_unicode;
1915 assert(PyUnicode_IS_READY(unicode));
1916 if (PyUnicode_IS_ASCII(unicode))
1917 return;
1918
1919 len = PyUnicode_GET_LENGTH(unicode);
1920 kind = PyUnicode_KIND(unicode);
1921 if (kind == PyUnicode_1BYTE_KIND) {
1922 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001923 max_char = ucs1lib_find_max_char(u, u + len);
1924 if (max_char >= 128)
1925 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001926 }
1927 else if (kind == PyUnicode_2BYTE_KIND) {
1928 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001929 max_char = ucs2lib_find_max_char(u, u + len);
1930 if (max_char >= 256)
1931 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001932 }
1933 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001934 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001935 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001936 max_char = ucs4lib_find_max_char(u, u + len);
1937 if (max_char >= 0x10000)
1938 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001939 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001940 copy = PyUnicode_New(len, max_char);
1941 copy_characters(copy, 0, unicode, 0, len);
1942 Py_DECREF(unicode);
1943 *p_unicode = copy;
1944}
1945
Victor Stinner034f6cf2011-09-30 02:26:44 +02001946PyObject*
1947PyUnicode_Copy(PyObject *unicode)
1948{
Victor Stinner87af4f22011-11-21 23:03:47 +01001949 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001950 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001951
Victor Stinner034f6cf2011-09-30 02:26:44 +02001952 if (!PyUnicode_Check(unicode)) {
1953 PyErr_BadInternalCall();
1954 return NULL;
1955 }
1956 if (PyUnicode_READY(unicode))
1957 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001958
Victor Stinner87af4f22011-11-21 23:03:47 +01001959 length = PyUnicode_GET_LENGTH(unicode);
1960 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001961 if (!copy)
1962 return NULL;
1963 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1964
Victor Stinner87af4f22011-11-21 23:03:47 +01001965 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1966 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001967 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001968 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001969}
1970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001971
Victor Stinnerbc603d12011-10-02 01:00:40 +02001972/* Widen Unicode objects to larger buffers. Don't write terminating null
1973 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001974
1975void*
1976_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1977{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001978 Py_ssize_t len;
1979 void *result;
1980 unsigned int skind;
1981
1982 if (PyUnicode_READY(s))
1983 return NULL;
1984
1985 len = PyUnicode_GET_LENGTH(s);
1986 skind = PyUnicode_KIND(s);
1987 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001988 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001989 return NULL;
1990 }
1991 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001992 case PyUnicode_2BYTE_KIND:
1993 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1994 if (!result)
1995 return PyErr_NoMemory();
1996 assert(skind == PyUnicode_1BYTE_KIND);
1997 _PyUnicode_CONVERT_BYTES(
1998 Py_UCS1, Py_UCS2,
1999 PyUnicode_1BYTE_DATA(s),
2000 PyUnicode_1BYTE_DATA(s) + len,
2001 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002003 case PyUnicode_4BYTE_KIND:
2004 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2005 if (!result)
2006 return PyErr_NoMemory();
2007 if (skind == PyUnicode_2BYTE_KIND) {
2008 _PyUnicode_CONVERT_BYTES(
2009 Py_UCS2, Py_UCS4,
2010 PyUnicode_2BYTE_DATA(s),
2011 PyUnicode_2BYTE_DATA(s) + len,
2012 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002013 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002014 else {
2015 assert(skind == PyUnicode_1BYTE_KIND);
2016 _PyUnicode_CONVERT_BYTES(
2017 Py_UCS1, Py_UCS4,
2018 PyUnicode_1BYTE_DATA(s),
2019 PyUnicode_1BYTE_DATA(s) + len,
2020 result);
2021 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002023 default:
2024 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002025 }
Victor Stinner01698042011-10-04 00:04:26 +02002026 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002027 return NULL;
2028}
2029
2030static Py_UCS4*
2031as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2032 int copy_null)
2033{
2034 int kind;
2035 void *data;
2036 Py_ssize_t len, targetlen;
2037 if (PyUnicode_READY(string) == -1)
2038 return NULL;
2039 kind = PyUnicode_KIND(string);
2040 data = PyUnicode_DATA(string);
2041 len = PyUnicode_GET_LENGTH(string);
2042 targetlen = len;
2043 if (copy_null)
2044 targetlen++;
2045 if (!target) {
2046 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2047 PyErr_NoMemory();
2048 return NULL;
2049 }
2050 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2051 if (!target) {
2052 PyErr_NoMemory();
2053 return NULL;
2054 }
2055 }
2056 else {
2057 if (targetsize < targetlen) {
2058 PyErr_Format(PyExc_SystemError,
2059 "string is longer than the buffer");
2060 if (copy_null && 0 < targetsize)
2061 target[0] = 0;
2062 return NULL;
2063 }
2064 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002065 if (kind == PyUnicode_1BYTE_KIND) {
2066 Py_UCS1 *start = (Py_UCS1 *) data;
2067 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002068 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002069 else if (kind == PyUnicode_2BYTE_KIND) {
2070 Py_UCS2 *start = (Py_UCS2 *) data;
2071 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2072 }
2073 else {
2074 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002075 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002076 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002077 if (copy_null)
2078 target[len] = 0;
2079 return target;
2080}
2081
2082Py_UCS4*
2083PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2084 int copy_null)
2085{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002086 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002087 PyErr_BadInternalCall();
2088 return NULL;
2089 }
2090 return as_ucs4(string, target, targetsize, copy_null);
2091}
2092
2093Py_UCS4*
2094PyUnicode_AsUCS4Copy(PyObject *string)
2095{
2096 return as_ucs4(string, NULL, 0, 1);
2097}
2098
#ifdef HAVE_WCHAR_H

/* Create a string from the wchar_t buffer `w` holding `size` characters.

   A size of -1 means that the length is computed with wcslen().  With
   w == NULL, the shared empty string is returned when size is 0 and a
   bad-internal-call error is raised otherwise. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        Py_ssize_t length = (size == -1) ? (Py_ssize_t)wcslen(w) : size;
        return PyUnicode_FromUnicode(w, length);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    Py_INCREF(unicode_empty);
    return unicode_empty;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002121
Walter Dörwald346737f2007-05-31 10:44:43 +00002122static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002123makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2124 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002125{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002126 *fmt++ = '%';
2127 if (width) {
2128 if (zeropad)
2129 *fmt++ = '0';
2130 fmt += sprintf(fmt, "%d", width);
2131 }
2132 if (precision)
2133 fmt += sprintf(fmt, ".%d", precision);
2134 if (longflag)
2135 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002136 else if (longlongflag) {
2137 /* longlongflag should only ever be nonzero on machines with
2138 HAVE_LONG_LONG defined */
2139#ifdef HAVE_LONG_LONG
2140 char *f = PY_FORMAT_LONG_LONG;
2141 while (*f)
2142 *fmt++ = *f++;
2143#else
2144 /* we shouldn't ever get here */
2145 assert(0);
2146 *fmt++ = 'l';
2147#endif
2148 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002149 else if (size_tflag) {
2150 char *f = PY_FORMAT_SIZE_T;
2151 while (*f)
2152 *fmt++ = *f++;
2153 }
2154 *fmt++ = c;
2155 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002156}
2157
/* Helper for PyUnicode_FromFormatV(): parse the part of a format
   specification located between the '%' and the conversion character.

   f must point to the '%' character.  The optional "width" and
   ".precision" digits are parsed first, then one of the length
   modifiers "l", "ll" (only if HAVE_LONG_LONG is defined) or "z".  A
   length modifier is only recognized when it is directly followed by
   'd', 'u' or 'i'.

   Each of the p_* output pointers may be NULL, in which case the
   corresponding value is not stored.

   Return a pointer to the conversion character.  For malformed
   specifications the pointer is moved one character backward (see the
   comments in the body), so the caller sees an unknown conversion. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int field_width = 0;
    int field_precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    ++f;                        /* step over the '%' */
    for (; Py_ISDIGIT((unsigned)*f); ++f)
        field_width = (field_width*10) + *f - '0';
    if (*f == '.') {
        ++f;
        for (; Py_ISDIGIT((unsigned)*f); ++f)
            field_precision = (field_precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            --f;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        --f;
    }

    /* Handle %ld, %lu, %lld, %llu and the size_t flag (%zd, %zu, %zi). */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_long = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            is_longlong = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_size_t = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    /* hand the results back through the non-NULL output pointers */
    if (p_width != NULL)
        *p_width = field_width;
    if (p_precision != NULL)
        *p_precision = field_precision;
    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
2222
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002223/* maximum number of characters required for output of %ld. 21 characters
2224 allows for 64-bit integers (in decimal) and an optional sign. */
2225#define MAX_LONG_CHARS 21
2226/* maximum number of characters required for output of %lld.
2227 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2228 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2229#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2230
Walter Dörwaldd2034312007-05-18 16:29:38 +00002231PyObject *
2232PyUnicode_FromFormatV(const char *format, va_list vargs)
2233{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002234 va_list count;
2235 Py_ssize_t callcount = 0;
2236 PyObject **callresults = NULL;
2237 PyObject **callresult = NULL;
2238 Py_ssize_t n = 0;
2239 int width = 0;
2240 int precision = 0;
2241 int zeropad;
2242 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002243 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002244 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002245 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2247 Py_UCS4 argmaxchar;
2248 Py_ssize_t numbersize = 0;
2249 char *numberresults = NULL;
2250 char *numberresult = NULL;
2251 Py_ssize_t i;
2252 int kind;
2253 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002254
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002255 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002256 /* step 1: count the number of %S/%R/%A/%s format specifications
2257 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2258 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002259 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002260 * also estimate a upper bound for all the number formats in the string,
2261 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002262 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002263 for (f = format; *f; f++) {
2264 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002265 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002266 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2267 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2268 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2269 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002272#ifdef HAVE_LONG_LONG
2273 if (longlongflag) {
2274 if (width < MAX_LONG_LONG_CHARS)
2275 width = MAX_LONG_LONG_CHARS;
2276 }
2277 else
2278#endif
2279 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2280 including sign. Decimal takes the most space. This
2281 isn't enough for octal. If a width is specified we
2282 need more (which we allocate later). */
2283 if (width < MAX_LONG_CHARS)
2284 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002285
2286 /* account for the size + '\0' to separate numbers
2287 inside of the numberresults buffer */
2288 numbersize += (width + 1);
2289 }
2290 }
2291 else if ((unsigned char)*f > 127) {
2292 PyErr_Format(PyExc_ValueError,
2293 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2294 "string, got a non-ASCII byte: 0x%02x",
2295 (unsigned char)*f);
2296 return NULL;
2297 }
2298 }
2299 /* step 2: allocate memory for the results of
2300 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2301 if (callcount) {
2302 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2303 if (!callresults) {
2304 PyErr_NoMemory();
2305 return NULL;
2306 }
2307 callresult = callresults;
2308 }
2309 /* step 2.5: allocate memory for the results of formating numbers */
2310 if (numbersize) {
2311 numberresults = PyObject_Malloc(numbersize);
2312 if (!numberresults) {
2313 PyErr_NoMemory();
2314 goto fail;
2315 }
2316 numberresult = numberresults;
2317 }
2318
2319 /* step 3: format numbers and figure out how large a buffer we need */
2320 for (f = format; *f; f++) {
2321 if (*f == '%') {
2322 const char* p;
2323 int longflag;
2324 int longlongflag;
2325 int size_tflag;
2326 int numprinted;
2327
2328 p = f;
2329 zeropad = (f[1] == '0');
2330 f = parse_format_flags(f, &width, &precision,
2331 &longflag, &longlongflag, &size_tflag);
2332 switch (*f) {
2333 case 'c':
2334 {
2335 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002336 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002337 n++;
2338 break;
2339 }
2340 case '%':
2341 n++;
2342 break;
2343 case 'i':
2344 case 'd':
2345 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2346 width, precision, *f);
2347 if (longflag)
2348 numprinted = sprintf(numberresult, fmt,
2349 va_arg(count, long));
2350#ifdef HAVE_LONG_LONG
2351 else if (longlongflag)
2352 numprinted = sprintf(numberresult, fmt,
2353 va_arg(count, PY_LONG_LONG));
2354#endif
2355 else if (size_tflag)
2356 numprinted = sprintf(numberresult, fmt,
2357 va_arg(count, Py_ssize_t));
2358 else
2359 numprinted = sprintf(numberresult, fmt,
2360 va_arg(count, int));
2361 n += numprinted;
2362 /* advance by +1 to skip over the '\0' */
2363 numberresult += (numprinted + 1);
2364 assert(*(numberresult - 1) == '\0');
2365 assert(*(numberresult - 2) != '\0');
2366 assert(numprinted >= 0);
2367 assert(numberresult <= numberresults + numbersize);
2368 break;
2369 case 'u':
2370 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2371 width, precision, 'u');
2372 if (longflag)
2373 numprinted = sprintf(numberresult, fmt,
2374 va_arg(count, unsigned long));
2375#ifdef HAVE_LONG_LONG
2376 else if (longlongflag)
2377 numprinted = sprintf(numberresult, fmt,
2378 va_arg(count, unsigned PY_LONG_LONG));
2379#endif
2380 else if (size_tflag)
2381 numprinted = sprintf(numberresult, fmt,
2382 va_arg(count, size_t));
2383 else
2384 numprinted = sprintf(numberresult, fmt,
2385 va_arg(count, unsigned int));
2386 n += numprinted;
2387 numberresult += (numprinted + 1);
2388 assert(*(numberresult - 1) == '\0');
2389 assert(*(numberresult - 2) != '\0');
2390 assert(numprinted >= 0);
2391 assert(numberresult <= numberresults + numbersize);
2392 break;
2393 case 'x':
2394 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2395 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2396 n += numprinted;
2397 numberresult += (numprinted + 1);
2398 assert(*(numberresult - 1) == '\0');
2399 assert(*(numberresult - 2) != '\0');
2400 assert(numprinted >= 0);
2401 assert(numberresult <= numberresults + numbersize);
2402 break;
2403 case 'p':
2404 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2405 /* %p is ill-defined: ensure leading 0x. */
2406 if (numberresult[1] == 'X')
2407 numberresult[1] = 'x';
2408 else if (numberresult[1] != 'x') {
2409 memmove(numberresult + 2, numberresult,
2410 strlen(numberresult) + 1);
2411 numberresult[0] = '0';
2412 numberresult[1] = 'x';
2413 numprinted += 2;
2414 }
2415 n += numprinted;
2416 numberresult += (numprinted + 1);
2417 assert(*(numberresult - 1) == '\0');
2418 assert(*(numberresult - 2) != '\0');
2419 assert(numprinted >= 0);
2420 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002421 break;
2422 case 's':
2423 {
2424 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002425 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002426 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002427 if (!str)
2428 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002429 /* since PyUnicode_DecodeUTF8 returns already flexible
2430 unicode objects, there is no need to call ready on them */
2431 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002432 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002434 /* Remember the str and switch to the next slot */
2435 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002436 break;
2437 }
2438 case 'U':
2439 {
2440 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002441 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002442 if (PyUnicode_READY(obj) == -1)
2443 goto fail;
2444 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002445 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002446 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002447 break;
2448 }
2449 case 'V':
2450 {
2451 PyObject *obj = va_arg(count, PyObject *);
2452 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002453 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002454 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002455 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002456 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002457 if (PyUnicode_READY(obj) == -1)
2458 goto fail;
2459 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002460 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002461 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002462 *callresult++ = NULL;
2463 }
2464 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002465 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002466 if (!str_obj)
2467 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002468 if (PyUnicode_READY(str_obj)) {
2469 Py_DECREF(str_obj);
2470 goto fail;
2471 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002472 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002473 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002474 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002475 *callresult++ = str_obj;
2476 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002477 break;
2478 }
2479 case 'S':
2480 {
2481 PyObject *obj = va_arg(count, PyObject *);
2482 PyObject *str;
2483 assert(obj);
2484 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002485 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002486 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002487 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002488 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002489 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002490 /* Remember the str and switch to the next slot */
2491 *callresult++ = str;
2492 break;
2493 }
2494 case 'R':
2495 {
2496 PyObject *obj = va_arg(count, PyObject *);
2497 PyObject *repr;
2498 assert(obj);
2499 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002500 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002501 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002502 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002503 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002505 /* Remember the repr and switch to the next slot */
2506 *callresult++ = repr;
2507 break;
2508 }
2509 case 'A':
2510 {
2511 PyObject *obj = va_arg(count, PyObject *);
2512 PyObject *ascii;
2513 assert(obj);
2514 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002515 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002516 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002517 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002518 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002519 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002520 /* Remember the repr and switch to the next slot */
2521 *callresult++ = ascii;
2522 break;
2523 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002524 default:
2525 /* if we stumble upon an unknown
2526 formatting code, copy the rest of
2527 the format string to the output
2528 string. (we cannot just skip the
2529 code, since there's no way to know
2530 what's in the argument list) */
2531 n += strlen(p);
2532 goto expand;
2533 }
2534 } else
2535 n++;
2536 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002537 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002538 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002539 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002540 we don't have to resize the string.
2541 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002542 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002543 if (!string)
2544 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002545 kind = PyUnicode_KIND(string);
2546 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002547 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002548 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002549
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002550 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002551 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002552 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002553
2554 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002555 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2556 /* checking for == because the last argument could be a empty
2557 string, which causes i to point to end, the assert at the end of
2558 the loop */
2559 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002560
Benjamin Peterson14339b62009-01-31 16:36:08 +00002561 switch (*f) {
2562 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002563 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002564 const int ordinal = va_arg(vargs, int);
2565 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002566 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002567 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002568 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002569 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002570 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002571 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002572 case 'p':
2573 /* unused, since we already have the result */
2574 if (*f == 'p')
2575 (void) va_arg(vargs, void *);
2576 else
2577 (void) va_arg(vargs, int);
2578 /* extract the result from numberresults and append. */
2579 for (; *numberresult; ++i, ++numberresult)
2580 PyUnicode_WRITE(kind, data, i, *numberresult);
2581 /* skip over the separating '\0' */
2582 assert(*numberresult == '\0');
2583 numberresult++;
2584 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002585 break;
2586 case 's':
2587 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002588 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002589 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002590 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002591 size = PyUnicode_GET_LENGTH(*callresult);
2592 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002593 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002594 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002595 /* We're done with the unicode()/repr() => forget it */
2596 Py_DECREF(*callresult);
2597 /* switch to next unicode()/repr() result */
2598 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002599 break;
2600 }
2601 case 'U':
2602 {
2603 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002604 Py_ssize_t size;
2605 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2606 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002607 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002609 break;
2610 }
2611 case 'V':
2612 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002613 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002614 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002615 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002616 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002617 size = PyUnicode_GET_LENGTH(obj);
2618 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002619 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002620 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002621 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002622 size = PyUnicode_GET_LENGTH(*callresult);
2623 assert(PyUnicode_KIND(*callresult) <=
2624 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002625 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002627 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002628 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002629 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002630 break;
2631 }
2632 case 'S':
2633 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002634 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002635 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002636 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002637 /* unused, since we already have the result */
2638 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002639 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002640 copy_characters(string, i, *callresult, 0, size);
2641 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002642 /* We're done with the unicode()/repr() => forget it */
2643 Py_DECREF(*callresult);
2644 /* switch to next unicode()/repr() result */
2645 ++callresult;
2646 break;
2647 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002649 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002650 break;
2651 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002652 for (; *p; ++p, ++i)
2653 PyUnicode_WRITE(kind, data, i, *p);
2654 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002655 goto end;
2656 }
Victor Stinner1205f272010-09-11 00:54:47 +00002657 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002658 else {
2659 assert(i < PyUnicode_GET_LENGTH(string));
2660 PyUnicode_WRITE(kind, data, i++, *f);
2661 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002663 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002664
Benjamin Peterson29060642009-01-31 22:14:21 +00002665 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002666 if (callresults)
2667 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002668 if (numberresults)
2669 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002670 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002671 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002672 if (callresults) {
2673 PyObject **callresult2 = callresults;
2674 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002675 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002676 ++callresult2;
2677 }
2678 PyObject_Free(callresults);
2679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002680 if (numberresults)
2681 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002682 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002683}
2684
Walter Dörwaldd2034312007-05-18 16:29:38 +00002685PyObject *
2686PyUnicode_FromFormat(const char *format, ...)
2687{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002688 PyObject* ret;
2689 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002690
2691#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002692 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002693#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002695#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002696 ret = PyUnicode_FromFormatV(format, vargs);
2697 va_end(vargs);
2698 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002699}
2700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002701#ifdef HAVE_WCHAR_H
2702
Victor Stinner5593d8a2010-10-02 11:11:27 +00002703/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2704 convert a Unicode object to a wide character string.
2705
Victor Stinnerd88d9832011-09-06 02:00:05 +02002706 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002707 character) required to convert the unicode object. Ignore size argument.
2708
Victor Stinnerd88d9832011-09-06 02:00:05 +02002709 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002710 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002711 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002712static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002713unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002714 wchar_t *w,
2715 Py_ssize_t size)
2716{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002717 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002718 const wchar_t *wstr;
2719
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002720 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002721 if (wstr == NULL)
2722 return -1;
2723
Victor Stinner5593d8a2010-10-02 11:11:27 +00002724 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002725 if (size > res)
2726 size = res + 1;
2727 else
2728 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002729 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002730 return res;
2731 }
2732 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002733 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002734}
2735
2736Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002737PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002738 wchar_t *w,
2739 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740{
2741 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002742 PyErr_BadInternalCall();
2743 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002745 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746}
2747
Victor Stinner137c34c2010-09-29 10:25:54 +00002748wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002749PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002750 Py_ssize_t *size)
2751{
2752 wchar_t* buffer;
2753 Py_ssize_t buflen;
2754
2755 if (unicode == NULL) {
2756 PyErr_BadInternalCall();
2757 return NULL;
2758 }
2759
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002760 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002761 if (buflen == -1)
2762 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002763 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002764 PyErr_NoMemory();
2765 return NULL;
2766 }
2767
Victor Stinner137c34c2010-09-29 10:25:54 +00002768 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2769 if (buffer == NULL) {
2770 PyErr_NoMemory();
2771 return NULL;
2772 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002773 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002774 if (buflen == -1)
2775 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002776 if (size != NULL)
2777 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002778 return buffer;
2779}
2780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002781#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002782
Alexander Belopolsky40018472011-02-26 01:02:56 +00002783PyObject *
2784PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002785{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002786 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002787 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002788 PyErr_SetString(PyExc_ValueError,
2789 "chr() arg not in range(0x110000)");
2790 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002791 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002793 if (ordinal < 256)
2794 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002796 v = PyUnicode_New(1, ordinal);
2797 if (v == NULL)
2798 return NULL;
2799 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002800 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002801 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002802}
2803
Alexander Belopolsky40018472011-02-26 01:02:56 +00002804PyObject *
2805PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002807 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002808 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002809 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002810 if (PyUnicode_READY(obj))
2811 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002812 Py_INCREF(obj);
2813 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002814 }
2815 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002816 /* For a Unicode subtype that's not a Unicode object,
2817 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002818 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002819 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002820 PyErr_Format(PyExc_TypeError,
2821 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002822 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002823 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002824}
2825
Alexander Belopolsky40018472011-02-26 01:02:56 +00002826PyObject *
2827PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002828 const char *encoding,
2829 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002830{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002831 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002832 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002833
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002835 PyErr_BadInternalCall();
2836 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002838
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002839 /* Decoding bytes objects is the most common case and should be fast */
2840 if (PyBytes_Check(obj)) {
2841 if (PyBytes_GET_SIZE(obj) == 0) {
2842 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002843 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002844 }
2845 else {
2846 v = PyUnicode_Decode(
2847 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2848 encoding, errors);
2849 }
2850 return v;
2851 }
2852
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002853 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002854 PyErr_SetString(PyExc_TypeError,
2855 "decoding str is not supported");
2856 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002857 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002858
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002859 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2860 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2861 PyErr_Format(PyExc_TypeError,
2862 "coercing to str: need bytes, bytearray "
2863 "or buffer-like object, %.80s found",
2864 Py_TYPE(obj)->tp_name);
2865 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002866 }
Tim Petersced69f82003-09-16 20:30:58 +00002867
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002868 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002869 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002870 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002871 }
Tim Petersced69f82003-09-16 20:30:58 +00002872 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002873 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002874
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002875 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002876 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877}
2878
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   lower receives the NUL-terminated result; lower_len is the capacity of
   lower in bytes, including room for the terminator.

   Return 0 on error (the result does not fit in lower_len bytes),
   1 on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1: never write past a short buffer */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    /* A zero-sized buffer cannot even hold the terminator, and
       &lower[lower_len - 1] below would point before the buffer. */
    if (lower_len == 0)
        return 0;
    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the trailing NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2915
Alexander Belopolsky40018472011-02-26 01:02:56 +00002916PyObject *
2917PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002918 Py_ssize_t size,
2919 const char *encoding,
2920 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002921{
2922 PyObject *buffer = NULL, *unicode;
2923 Py_buffer info;
2924 char lower[11]; /* Enough for any encoding shortcut */
2925
Fred Drakee4315f52000-05-09 19:53:39 +00002926 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002927 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002928 if ((strcmp(lower, "utf-8") == 0) ||
2929 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002930 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002931 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002932 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002933 (strcmp(lower, "iso-8859-1") == 0))
2934 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002935#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002936 else if (strcmp(lower, "mbcs") == 0)
2937 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002938#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002939 else if (strcmp(lower, "ascii") == 0)
2940 return PyUnicode_DecodeASCII(s, size, errors);
2941 else if (strcmp(lower, "utf-16") == 0)
2942 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2943 else if (strcmp(lower, "utf-32") == 0)
2944 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2945 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002946
2947 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002948 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002949 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002950 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002951 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002952 if (buffer == NULL)
2953 goto onError;
2954 unicode = PyCodec_Decode(buffer, encoding, errors);
2955 if (unicode == NULL)
2956 goto onError;
2957 if (!PyUnicode_Check(unicode)) {
2958 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002959 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002960 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002961 Py_DECREF(unicode);
2962 goto onError;
2963 }
2964 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002965 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002966
Benjamin Peterson29060642009-01-31 22:14:21 +00002967 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968 Py_XDECREF(buffer);
2969 return NULL;
2970}
2971
Alexander Belopolsky40018472011-02-26 01:02:56 +00002972PyObject *
2973PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002974 const char *encoding,
2975 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002976{
2977 PyObject *v;
2978
2979 if (!PyUnicode_Check(unicode)) {
2980 PyErr_BadArgument();
2981 goto onError;
2982 }
2983
2984 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002985 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002986
2987 /* Decode via the codec registry */
2988 v = PyCodec_Decode(unicode, encoding, errors);
2989 if (v == NULL)
2990 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002991 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002992
Benjamin Peterson29060642009-01-31 22:14:21 +00002993 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002994 return NULL;
2995}
2996
Alexander Belopolsky40018472011-02-26 01:02:56 +00002997PyObject *
2998PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002999 const char *encoding,
3000 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003001{
3002 PyObject *v;
3003
3004 if (!PyUnicode_Check(unicode)) {
3005 PyErr_BadArgument();
3006 goto onError;
3007 }
3008
3009 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003010 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003011
3012 /* Decode via the codec registry */
3013 v = PyCodec_Decode(unicode, encoding, errors);
3014 if (v == NULL)
3015 goto onError;
3016 if (!PyUnicode_Check(v)) {
3017 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003018 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003019 Py_TYPE(v)->tp_name);
3020 Py_DECREF(v);
3021 goto onError;
3022 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003023 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003024
Benjamin Peterson29060642009-01-31 22:14:21 +00003025 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003026 return NULL;
3027}
3028
Alexander Belopolsky40018472011-02-26 01:02:56 +00003029PyObject *
3030PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003031 Py_ssize_t size,
3032 const char *encoding,
3033 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034{
3035 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003036
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 unicode = PyUnicode_FromUnicode(s, size);
3038 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003039 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3041 Py_DECREF(unicode);
3042 return v;
3043}
3044
Alexander Belopolsky40018472011-02-26 01:02:56 +00003045PyObject *
3046PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003047 const char *encoding,
3048 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003049{
3050 PyObject *v;
3051
3052 if (!PyUnicode_Check(unicode)) {
3053 PyErr_BadArgument();
3054 goto onError;
3055 }
3056
3057 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003058 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003059
3060 /* Encode via the codec registry */
3061 v = PyCodec_Encode(unicode, encoding, errors);
3062 if (v == NULL)
3063 goto onError;
3064 return v;
3065
Benjamin Peterson29060642009-01-31 22:14:21 +00003066 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003067 return NULL;
3068}
3069
Victor Stinnerad158722010-10-27 00:25:46 +00003070PyObject *
3071PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003072{
Victor Stinner99b95382011-07-04 14:23:54 +02003073#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003074 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003075#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003076 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003077#else
Victor Stinner793b5312011-04-27 00:24:21 +02003078 PyInterpreterState *interp = PyThreadState_GET()->interp;
3079 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3080 cannot use it to encode and decode filenames before it is loaded. Load
3081 the Python codec requires to encode at least its own filename. Use the C
3082 version of the locale codec until the codec registry is initialized and
3083 the Python codec is loaded.
3084
3085 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3086 cannot only rely on it: check also interp->fscodec_initialized for
3087 subinterpreters. */
3088 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003089 return PyUnicode_AsEncodedString(unicode,
3090 Py_FileSystemDefaultEncoding,
3091 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003092 }
3093 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003094 /* locale encoding with surrogateescape */
3095 wchar_t *wchar;
3096 char *bytes;
3097 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003098 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003099
3100 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3101 if (wchar == NULL)
3102 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003103 bytes = _Py_wchar2char(wchar, &error_pos);
3104 if (bytes == NULL) {
3105 if (error_pos != (size_t)-1) {
3106 char *errmsg = strerror(errno);
3107 PyObject *exc = NULL;
3108 if (errmsg == NULL)
3109 errmsg = "Py_wchar2char() failed";
3110 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003111 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003112 error_pos, error_pos+1,
3113 errmsg);
3114 Py_XDECREF(exc);
3115 }
3116 else
3117 PyErr_NoMemory();
3118 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003119 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003120 }
3121 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003122
3123 bytes_obj = PyBytes_FromString(bytes);
3124 PyMem_Free(bytes);
3125 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003126 }
Victor Stinnerad158722010-10-27 00:25:46 +00003127#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003128}
3129
Alexander Belopolsky40018472011-02-26 01:02:56 +00003130PyObject *
3131PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003132 const char *encoding,
3133 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003134{
3135 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003136 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003137
Guido van Rossumd57fd912000-03-10 22:53:23 +00003138 if (!PyUnicode_Check(unicode)) {
3139 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003140 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141 }
Fred Drakee4315f52000-05-09 19:53:39 +00003142
Fred Drakee4315f52000-05-09 19:53:39 +00003143 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003144 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003145 if ((strcmp(lower, "utf-8") == 0) ||
3146 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003147 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003148 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003149 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003150 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003151 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003152 }
Victor Stinner37296e82010-06-10 13:36:23 +00003153 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003154 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003155 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003156 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003157#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003158 else if (strcmp(lower, "mbcs") == 0)
3159 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003160#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003161 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003162 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003163 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003164
3165 /* Encode via the codec registry */
3166 v = PyCodec_Encode(unicode, encoding, errors);
3167 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003168 return NULL;
3169
3170 /* The normal path */
3171 if (PyBytes_Check(v))
3172 return v;
3173
3174 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003175 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003176 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003177 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003178
3179 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3180 "encoder %s returned bytearray instead of bytes",
3181 encoding);
3182 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003183 Py_DECREF(v);
3184 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003185 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003186
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003187 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3188 Py_DECREF(v);
3189 return b;
3190 }
3191
3192 PyErr_Format(PyExc_TypeError,
3193 "encoder did not return a bytes object (type=%.400s)",
3194 Py_TYPE(v)->tp_name);
3195 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003196 return NULL;
3197}
3198
Alexander Belopolsky40018472011-02-26 01:02:56 +00003199PyObject *
3200PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003201 const char *encoding,
3202 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003203{
3204 PyObject *v;
3205
3206 if (!PyUnicode_Check(unicode)) {
3207 PyErr_BadArgument();
3208 goto onError;
3209 }
3210
3211 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003212 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003213
3214 /* Encode via the codec registry */
3215 v = PyCodec_Encode(unicode, encoding, errors);
3216 if (v == NULL)
3217 goto onError;
3218 if (!PyUnicode_Check(v)) {
3219 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003220 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003221 Py_TYPE(v)->tp_name);
3222 Py_DECREF(v);
3223 goto onError;
3224 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003226
Benjamin Peterson29060642009-01-31 22:14:21 +00003227 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003228 return NULL;
3229}
3230
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003231PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003232PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003233 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003234 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3235}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003236
Christian Heimes5894ba72007-11-04 11:43:14 +00003237PyObject*
3238PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3239{
Victor Stinner99b95382011-07-04 14:23:54 +02003240#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003241 return PyUnicode_DecodeMBCS(s, size, NULL);
3242#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003243 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003244#else
Victor Stinner793b5312011-04-27 00:24:21 +02003245 PyInterpreterState *interp = PyThreadState_GET()->interp;
3246 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3247 cannot use it to encode and decode filenames before it is loaded. Load
3248 the Python codec requires to encode at least its own filename. Use the C
3249 version of the locale codec until the codec registry is initialized and
3250 the Python codec is loaded.
3251
3252 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3253 cannot only rely on it: check also interp->fscodec_initialized for
3254 subinterpreters. */
3255 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003256 return PyUnicode_Decode(s, size,
3257 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003258 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003259 }
3260 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003261 /* locale encoding with surrogateescape */
3262 wchar_t *wchar;
3263 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003264 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003265
3266 if (s[size] != '\0' || size != strlen(s)) {
3267 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3268 return NULL;
3269 }
3270
Victor Stinner168e1172010-10-16 23:16:16 +00003271 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003272 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003273 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003274
Victor Stinner168e1172010-10-16 23:16:16 +00003275 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003276 PyMem_Free(wchar);
3277 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003278 }
Victor Stinnerad158722010-10-27 00:25:46 +00003279#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003280}
3281
Martin v. Löwis011e8422009-05-05 04:43:17 +00003282
3283int
3284PyUnicode_FSConverter(PyObject* arg, void* addr)
3285{
3286 PyObject *output = NULL;
3287 Py_ssize_t size;
3288 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003289 if (arg == NULL) {
3290 Py_DECREF(*(PyObject**)addr);
3291 return 1;
3292 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003293 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003294 output = arg;
3295 Py_INCREF(output);
3296 }
3297 else {
3298 arg = PyUnicode_FromObject(arg);
3299 if (!arg)
3300 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003301 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003302 Py_DECREF(arg);
3303 if (!output)
3304 return 0;
3305 if (!PyBytes_Check(output)) {
3306 Py_DECREF(output);
3307 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3308 return 0;
3309 }
3310 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003311 size = PyBytes_GET_SIZE(output);
3312 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003313 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003314 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003315 Py_DECREF(output);
3316 return 0;
3317 }
3318 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003319 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003320}
3321
3322
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003323int
3324PyUnicode_FSDecoder(PyObject* arg, void* addr)
3325{
3326 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003327 if (arg == NULL) {
3328 Py_DECREF(*(PyObject**)addr);
3329 return 1;
3330 }
3331 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003332 if (PyUnicode_READY(arg))
3333 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003334 output = arg;
3335 Py_INCREF(output);
3336 }
3337 else {
3338 arg = PyBytes_FromObject(arg);
3339 if (!arg)
3340 return 0;
3341 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3342 PyBytes_GET_SIZE(arg));
3343 Py_DECREF(arg);
3344 if (!output)
3345 return 0;
3346 if (!PyUnicode_Check(output)) {
3347 Py_DECREF(output);
3348 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3349 return 0;
3350 }
3351 }
Victor Stinner065836e2011-10-27 01:56:33 +02003352 if (PyUnicode_READY(output) < 0) {
3353 Py_DECREF(output);
3354 return 0;
3355 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003356 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003357 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003358 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3359 Py_DECREF(output);
3360 return 0;
3361 }
3362 *(PyObject**)addr = output;
3363 return Py_CLEANUP_SUPPORTED;
3364}
3365
3366
Martin v. Löwis5b222132007-06-10 09:51:05 +00003367char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003368PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003369{
Christian Heimesf3863112007-11-22 07:46:41 +00003370 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003371
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003372 if (!PyUnicode_Check(unicode)) {
3373 PyErr_BadArgument();
3374 return NULL;
3375 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003376 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003377 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003378
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003379 if (PyUnicode_UTF8(unicode) == NULL) {
3380 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003381 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3382 if (bytes == NULL)
3383 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003384 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3385 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003386 Py_DECREF(bytes);
3387 return NULL;
3388 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003389 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3390 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3391 PyBytes_AS_STRING(bytes),
3392 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003393 Py_DECREF(bytes);
3394 }
3395
3396 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003397 *psize = PyUnicode_UTF8_LENGTH(unicode);
3398 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003399}
3400
3401char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003402PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003403{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003404 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3405}
3406
3407#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003408static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003409#endif
3410
3411
3412Py_UNICODE *
3413PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3414{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003415 const unsigned char *one_byte;
3416#if SIZEOF_WCHAR_T == 4
3417 const Py_UCS2 *two_bytes;
3418#else
3419 const Py_UCS4 *four_bytes;
3420 const Py_UCS4 *ucs4_end;
3421 Py_ssize_t num_surrogates;
3422#endif
3423 wchar_t *w;
3424 wchar_t *wchar_end;
3425
3426 if (!PyUnicode_Check(unicode)) {
3427 PyErr_BadArgument();
3428 return NULL;
3429 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003430 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003431 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003432 assert(_PyUnicode_KIND(unicode) != 0);
3433 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003434
3435#ifdef Py_DEBUG
3436 ++unicode_as_unicode_calls;
3437#endif
3438
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003439 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003440#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003441 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3442 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003443 num_surrogates = 0;
3444
3445 for (; four_bytes < ucs4_end; ++four_bytes) {
3446 if (*four_bytes > 0xFFFF)
3447 ++num_surrogates;
3448 }
3449
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003450 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3451 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3452 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003453 PyErr_NoMemory();
3454 return NULL;
3455 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003456 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003457
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003458 w = _PyUnicode_WSTR(unicode);
3459 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3460 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003461 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3462 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003463 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003464 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003465 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3466 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003467 }
3468 else
3469 *w = *four_bytes;
3470
3471 if (w > wchar_end) {
3472 assert(0 && "Miscalculated string end");
3473 }
3474 }
3475 *w = 0;
3476#else
3477 /* sizeof(wchar_t) == 4 */
3478 Py_FatalError("Impossible unicode object state, wstr and str "
3479 "should share memory already.");
3480 return NULL;
3481#endif
3482 }
3483 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003484 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3485 (_PyUnicode_LENGTH(unicode) + 1));
3486 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003487 PyErr_NoMemory();
3488 return NULL;
3489 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003490 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3491 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3492 w = _PyUnicode_WSTR(unicode);
3493 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003494
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003495 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3496 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003497 for (; w < wchar_end; ++one_byte, ++w)
3498 *w = *one_byte;
3499 /* null-terminate the wstr */
3500 *w = 0;
3501 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003502 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003503#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003504 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003505 for (; w < wchar_end; ++two_bytes, ++w)
3506 *w = *two_bytes;
3507 /* null-terminate the wstr */
3508 *w = 0;
3509#else
3510 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003511 PyObject_FREE(_PyUnicode_WSTR(unicode));
3512 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003513 Py_FatalError("Impossible unicode object state, wstr "
3514 "and str should share memory already.");
3515 return NULL;
3516#endif
3517 }
3518 else {
3519 assert(0 && "This should never happen.");
3520 }
3521 }
3522 }
3523 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003524 *size = PyUnicode_WSTR_LENGTH(unicode);
3525 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003526}
3527
Alexander Belopolsky40018472011-02-26 01:02:56 +00003528Py_UNICODE *
3529PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003531 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003532}
3533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003534
Alexander Belopolsky40018472011-02-26 01:02:56 +00003535Py_ssize_t
3536PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537{
3538 if (!PyUnicode_Check(unicode)) {
3539 PyErr_BadArgument();
3540 goto onError;
3541 }
3542 return PyUnicode_GET_SIZE(unicode);
3543
Benjamin Peterson29060642009-01-31 22:14:21 +00003544 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003545 return -1;
3546}
3547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003548Py_ssize_t
3549PyUnicode_GetLength(PyObject *unicode)
3550{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003551 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003552 PyErr_BadArgument();
3553 return -1;
3554 }
3555
3556 return PyUnicode_GET_LENGTH(unicode);
3557}
3558
3559Py_UCS4
3560PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3561{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003562 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3563 PyErr_BadArgument();
3564 return (Py_UCS4)-1;
3565 }
3566 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3567 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003568 return (Py_UCS4)-1;
3569 }
3570 return PyUnicode_READ_CHAR(unicode, index);
3571}
3572
3573int
3574PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3575{
3576 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003577 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003578 return -1;
3579 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003580 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3581 PyErr_SetString(PyExc_IndexError, "string index out of range");
3582 return -1;
3583 }
3584 if (_PyUnicode_Dirty(unicode))
3585 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003586 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3587 index, ch);
3588 return 0;
3589}
3590
/* Report the name of the default encoding.  The value is the fixed
   string "utf-8"; the returned pointer refers to static storage and
   must not be modified or freed by the caller. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3596
Victor Stinner554f3f02010-06-16 23:33:54 +00003597/* create or adjust a UnicodeDecodeError */
3598static void
3599make_decode_exception(PyObject **exceptionObject,
3600 const char *encoding,
3601 const char *input, Py_ssize_t length,
3602 Py_ssize_t startpos, Py_ssize_t endpos,
3603 const char *reason)
3604{
3605 if (*exceptionObject == NULL) {
3606 *exceptionObject = PyUnicodeDecodeError_Create(
3607 encoding, input, length, startpos, endpos, reason);
3608 }
3609 else {
3610 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3611 goto onError;
3612 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3613 goto onError;
3614 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3615 goto onError;
3616 }
3617 return;
3618
3619onError:
3620 Py_DECREF(*exceptionObject);
3621 *exceptionObject = NULL;
3622}
3623
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003624/* error handling callback helper:
3625 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003626 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003627 and adjust various state variables.
3628 return 0 on success, -1 on error
3629*/
3630
Alexander Belopolsky40018472011-02-26 01:02:56 +00003631static int
3632unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003633 const char *encoding, const char *reason,
3634 const char **input, const char **inend, Py_ssize_t *startinpos,
3635 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003636 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003638 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003639
3640 PyObject *restuple = NULL;
3641 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003642 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003643 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003644 Py_ssize_t requiredsize;
3645 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003646 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003647 int res = -1;
3648
Victor Stinner596a6c42011-11-09 00:02:18 +01003649 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3650 outsize = PyUnicode_GET_LENGTH(*output);
3651 else
3652 outsize = _PyUnicode_WSTR_LENGTH(*output);
3653
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003654 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003655 *errorHandler = PyCodec_LookupError(errors);
3656 if (*errorHandler == NULL)
3657 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003658 }
3659
Victor Stinner554f3f02010-06-16 23:33:54 +00003660 make_decode_exception(exceptionObject,
3661 encoding,
3662 *input, *inend - *input,
3663 *startinpos, *endinpos,
3664 reason);
3665 if (*exceptionObject == NULL)
3666 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003667
3668 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3669 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003670 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003671 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003672 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003673 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003674 }
3675 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003676 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003677 if (PyUnicode_READY(repunicode) < 0)
3678 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003679
3680 /* Copy back the bytes variables, which might have been modified by the
3681 callback */
3682 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3683 if (!inputobj)
3684 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003685 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003686 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003687 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003688 *input = PyBytes_AS_STRING(inputobj);
3689 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003690 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003691 /* we can DECREF safely, as the exception has another reference,
3692 so the object won't go away. */
3693 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003694
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003695 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003696 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003697 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003698 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3699 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003700 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003701
Victor Stinner596a6c42011-11-09 00:02:18 +01003702 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3703 /* need more space? (at least enough for what we
3704 have+the replacement+the rest of the string (starting
3705 at the new input position), so we won't have to check space
3706 when there are no errors in the rest of the string) */
3707 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3708 requiredsize = *outpos + replen + insize-newpos;
3709 if (requiredsize > outsize) {
3710 if (requiredsize<2*outsize)
3711 requiredsize = 2*outsize;
3712 if (unicode_resize(output, requiredsize) < 0)
3713 goto onError;
3714 }
3715 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003716 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003717 copy_characters(*output, *outpos, repunicode, 0, replen);
3718 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003720 else {
3721 wchar_t *repwstr;
3722 Py_ssize_t repwlen;
3723 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3724 if (repwstr == NULL)
3725 goto onError;
3726 /* need more space? (at least enough for what we
3727 have+the replacement+the rest of the string (starting
3728 at the new input position), so we won't have to check space
3729 when there are no errors in the rest of the string) */
3730 requiredsize = *outpos + repwlen + insize-newpos;
3731 if (requiredsize > outsize) {
3732 if (requiredsize < 2*outsize)
3733 requiredsize = 2*outsize;
3734 if (unicode_resize(output, requiredsize) < 0)
3735 goto onError;
3736 }
3737 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3738 *outpos += repwlen;
3739 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003740 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003741 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003742
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003743 /* we made it! */
3744 res = 0;
3745
Benjamin Peterson29060642009-01-31 22:14:21 +00003746 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003747 Py_XDECREF(restuple);
3748 return res;
3749}
3750
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003751/* --- UTF-7 Codec -------------------------------------------------------- */
3752
Antoine Pitrou244651a2009-05-04 18:56:13 +00003753/* See RFC2152 for details. We encode conservatively and decode liberally. */
3754
/* Three simple macros defining base-64.
   All of the macros below may evaluate their argument more than once,
   so the argument must be free of side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   (any c outside the tested ranges that is not '+' yields 63, so the
   caller must check IS_BASE64(c) first) */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0-127) and is only ever read,
 * hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked (0 < c < 128) before it is used as a table index.
 * The macro evaluates c several times, so c must have no side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003831
Alexander Belopolsky40018472011-02-26 01:02:56 +00003832PyObject *
3833PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003834 Py_ssize_t size,
3835 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003836{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003837 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3838}
3839
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * Parameters:
 *   s, size   - the input bytes and their count.
 *   errors    - error handler name, forwarded to
 *               unicode_decode_call_errorhandler().
 *   consumed  - may be NULL.  If non-NULL, input that ends inside a
 *               shift sequence is not reported as an error: the output
 *               is cut back to where the sequence began and *consumed
 *               is set to the input position of its '+'.  Otherwise
 *               *consumed receives the number of bytes processed.
 *
 * Returns the decoded string object, or NULL on error. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;          /* input position where the current
                                       shift sequence / error began */
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyObject *unicode;
    const char *errmsg = "";
    int inShift = 0;
    Py_ssize_t shiftOutStart;       /* output position at the start of the
                                       current shift sequence */
    unsigned int base64bits = 0;    /* number of valid bits in base64buffer */
    unsigned long base64buffer = 0;
    Py_UCS4 surrogate = 0;          /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    unicode = PyUnicode_New(size, 127);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return unicode;
    }

    shiftOutStart = outpos = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Also entered by the "goto restart" after the loop, when the
           error handler for an unterminated shift sequence moved s back
           before the end of the input. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (unicode_putchar(&unicode, &outpos, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* No low surrogate followed: emit the pending
                               high surrogate on its own, then handle outCh
                               below. */
                            if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (unicode_putchar(&unicode, &outpos, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    /* flush a high surrogate that never got its partner */
                    if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
                        goto onError;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    if (unicode_putchar(&unicode, &outpos, ch) < 0)
                        goto onError;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (unicode_putchar(&unicode, &outpos, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = outpos;
                base64bits = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            if (unicode_putchar(&unicode, &outpos, ch) < 0)
                goto onError;
            s++;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
        /* Reached only through goto: report the error range
           [startinpos, current position) to the error handler, which
           updates s, starts, e and the output on success. */
utf7Error:
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &unicode, &outpos))
                goto onError;
            /* the handler may have asked to resume before the end */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            outpos = shiftOutStart; /* back off output */
            *consumed = startinpos;
        }
        else {
            *consumed = s-starts;
        }
    }

    /* trim the output to the number of characters actually written */
    if (unicode_resize(&unicode, outpos) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return unicode_result(unicode);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
4030
4031
Alexander Belopolsky40018472011-02-26 01:02:56 +00004032PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004033_PyUnicode_EncodeUTF7(PyObject *str,
4034 int base64SetO,
4035 int base64WhiteSpace,
4036 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004037{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004038 int kind;
4039 void *data;
4040 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004041 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004042 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004043 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004044 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004045 unsigned int base64bits = 0;
4046 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004047 char * out;
4048 char * start;
4049
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004050 if (PyUnicode_READY(str) < 0)
4051 return NULL;
4052 kind = PyUnicode_KIND(str);
4053 data = PyUnicode_DATA(str);
4054 len = PyUnicode_GET_LENGTH(str);
4055
4056 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004057 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004058
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004059 /* It might be possible to tighten this worst case */
4060 allocated = 8 * len;
4061 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004062 return PyErr_NoMemory();
4063
Antoine Pitrou244651a2009-05-04 18:56:13 +00004064 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004065 if (v == NULL)
4066 return NULL;
4067
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004068 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004069 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004070 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004071
Antoine Pitrou244651a2009-05-04 18:56:13 +00004072 if (inShift) {
4073 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4074 /* shifting out */
4075 if (base64bits) { /* output remaining bits */
4076 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4077 base64buffer = 0;
4078 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004079 }
4080 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004081 /* Characters not in the BASE64 set implicitly unshift the sequence
4082 so no '-' is required, except if the character is itself a '-' */
4083 if (IS_BASE64(ch) || ch == '-') {
4084 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004085 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004086 *out++ = (char) ch;
4087 }
4088 else {
4089 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004090 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004091 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004092 else { /* not in a shift sequence */
4093 if (ch == '+') {
4094 *out++ = '+';
4095 *out++ = '-';
4096 }
4097 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4098 *out++ = (char) ch;
4099 }
4100 else {
4101 *out++ = '+';
4102 inShift = 1;
4103 goto encode_char;
4104 }
4105 }
4106 continue;
4107encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004108 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004109 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004110
Antoine Pitrou244651a2009-05-04 18:56:13 +00004111 /* code first surrogate */
4112 base64bits += 16;
4113 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4114 while (base64bits >= 6) {
4115 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4116 base64bits -= 6;
4117 }
4118 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004119 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004120 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004121 base64bits += 16;
4122 base64buffer = (base64buffer << 16) | ch;
4123 while (base64bits >= 6) {
4124 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4125 base64bits -= 6;
4126 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004127 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004128 if (base64bits)
4129 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4130 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004131 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004132 if (_PyBytes_Resize(&v, out - start) < 0)
4133 return NULL;
4134 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004135}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004136PyObject *
4137PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4138 Py_ssize_t size,
4139 int base64SetO,
4140 int base64WhiteSpace,
4141 const char *errors)
4142{
4143 PyObject *result;
4144 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4145 if (tmp == NULL)
4146 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004147 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004148 base64WhiteSpace, errors);
4149 Py_DECREF(tmp);
4150 return result;
4151}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004152
Antoine Pitrou244651a2009-05-04 18:56:13 +00004153#undef IS_BASE64
4154#undef FROM_BASE64
4155#undef TO_BASE64
4156#undef DECODE_DIRECT
4157#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004158
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159/* --- UTF-8 Codec -------------------------------------------------------- */
4160
/* Read-only lookup table indexed by the first byte of a UTF-8 sequence;
   declared const because it is never written. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4182
Alexander Belopolsky40018472011-02-26 01:02:56 +00004183PyObject *
4184PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004185 Py_ssize_t size,
4186 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004187{
Walter Dörwald69652032004-09-07 20:24:22 +00004188 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4189}
4190
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004191#include "stringlib/ucs1lib.h"
4192#include "stringlib/codecs.h"
4193#include "stringlib/undef.h"
4194
4195#include "stringlib/ucs2lib.h"
4196#include "stringlib/codecs.h"
4197#include "stringlib/undef.h"
4198
4199#include "stringlib/ucs4lib.h"
4200#include "stringlib/codecs.h"
4201#include "stringlib/undef.h"
4202
Antoine Pitrouab868312009-01-10 15:40:25 +00004203/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4204#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4205
4206/* Mask to quickly check whether a C 'long' contains a
4207 non-ASCII, UTF8-encoded char. */
4208#if (SIZEOF_LONG == 8)
4209# define ASCII_CHAR_MASK 0x8080808080808080L
4210#elif (SIZEOF_LONG == 4)
4211# define ASCII_CHAR_MASK 0x80808080L
4212#else
4213# error C 'long' size should be either 4 or 8!
4214#endif
4215
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004216/* Scans a UTF-8 string and returns the maximum character to be expected
4217 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004218
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004219 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004220 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004221 */
4222static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004223utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004224{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004225 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004226 const unsigned char *end = p + string_size;
4227 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004228
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004229 assert(unicode_size != NULL);
4230
4231 /* By having a cascade of independent loops which fallback onto each
4232 other, we minimize the amount of work done in the average loop
4233 iteration, and we also maximize the CPU's ability to predict
4234 branches correctly (because a given condition will have always the
4235 same boolean outcome except perhaps in the last iteration of the
4236 corresponding loop).
4237 In the general case this brings us rather close to decoding
4238 performance pre-PEP 393, despite the two-pass decoding.
4239
4240 Note that the pure ASCII loop is not duplicated once a non-ASCII
4241 character has been encountered. It is actually a pessimization (by
4242 a significant factor) to use this loop on text with many non-ASCII
4243 characters, and it is important to avoid bad performance on valid
4244 utf-8 data (invalid utf-8 being a different can of worms).
4245 */
4246
4247 /* ASCII */
4248 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004249 /* Only check value if it's not a ASCII char... */
4250 if (*p < 0x80) {
4251 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4252 an explanation. */
4253 if (!((size_t) p & LONG_PTR_MASK)) {
4254 /* Help register allocation */
4255 register const unsigned char *_p = p;
4256 while (_p < aligned_end) {
4257 unsigned long value = *(unsigned long *) _p;
4258 if (value & ASCII_CHAR_MASK)
4259 break;
4260 _p += SIZEOF_LONG;
4261 char_count += SIZEOF_LONG;
4262 }
4263 p = _p;
4264 if (p == end)
4265 break;
4266 }
4267 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004268 if (*p < 0x80)
4269 ++char_count;
4270 else
4271 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004272 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004273 *unicode_size = char_count;
4274 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004275
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004276_ucs1loop:
4277 for (; p < end; ++p) {
4278 if (*p < 0xc4)
4279 char_count += ((*p & 0xc0) != 0x80);
4280 else
4281 goto _ucs2loop;
4282 }
4283 *unicode_size = char_count;
4284 return 255;
4285
4286_ucs2loop:
4287 for (; p < end; ++p) {
4288 if (*p < 0xf0)
4289 char_count += ((*p & 0xc0) != 0x80);
4290 else
4291 goto _ucs4loop;
4292 }
4293 *unicode_size = char_count;
4294 return 65535;
4295
4296_ucs4loop:
4297 for (; p < end; ++p) {
4298 char_count += ((*p & 0xc0) != 0x80);
4299 }
4300 *unicode_size = char_count;
4301 return 65537;
4302}
4303
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004304/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
Victor Stinner785938e2011-12-11 20:09:03 +01004305 in case of errors. Implicit parameters: unicode, kind, data, onError.
4306 Potential resizing overallocates, so the result needs to shrink at the end.
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004307*/
Victor Stinner785938e2011-12-11 20:09:03 +01004308#define WRITE_MAYBE_FAIL(index, value) \
4309 do { \
4310 Py_ssize_t pos = index; \
4311 if (pos > PyUnicode_GET_LENGTH(unicode) && \
4312 unicode_resize(&unicode, pos + pos/8) < 0) \
4313 goto onError; \
4314 if (unicode_putchar(&unicode, &pos, value) < 0) \
4315 goto onError; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004316 } while (0)
4317
Alexander Belopolsky40018472011-02-26 01:02:56 +00004318PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004319decode_utf8_errors(const char *starts,
4320 Py_ssize_t size,
4321 const char *errors,
4322 Py_ssize_t *consumed,
4323 const char *s,
4324 PyObject *unicode,
4325 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004326{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004328 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004329 Py_ssize_t startinpos;
4330 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004331 const char *e = starts + size;
4332 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004333 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004334 PyObject *errorHandler = NULL;
4335 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004336
Antoine Pitrouab868312009-01-10 15:40:25 +00004337 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004338
4339 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004340 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341
4342 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004343 /* Fast path for runs of ASCII characters. Given that common UTF-8
4344 input will consist of an overwhelming majority of ASCII
4345 characters, we try to optimize for this case by checking
4346 as many characters as a C 'long' can contain.
4347 First, check if we can do an aligned read, as most CPUs have
4348 a penalty for unaligned reads.
4349 */
4350 if (!((size_t) s & LONG_PTR_MASK)) {
4351 /* Help register allocation */
4352 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004353 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004354 while (_s < aligned_end) {
4355 /* Read a whole long at a time (either 4 or 8 bytes),
4356 and do a fast unrolled copy if it only contains ASCII
4357 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004358 unsigned long value = *(unsigned long *) _s;
4359 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004360 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004361 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4362 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4363 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4364 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004365#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004366 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4367 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4368 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4369 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004370#endif
4371 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004372 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004373 }
4374 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004375 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004376 if (s == e)
4377 break;
4378 ch = (unsigned char)*s;
4379 }
4380 }
4381
4382 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004383 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004384 s++;
4385 continue;
4386 }
4387
4388 n = utf8_code_length[ch];
4389
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004390 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004391 if (consumed)
4392 break;
4393 else {
4394 errmsg = "unexpected end of data";
4395 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004396 endinpos = startinpos+1;
4397 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4398 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004399 goto utf8Error;
4400 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004401 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004402
4403 switch (n) {
4404
4405 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004406 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004407 startinpos = s-starts;
4408 endinpos = startinpos+1;
4409 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004410
4411 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004412 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004413 startinpos = s-starts;
4414 endinpos = startinpos+1;
4415 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416
4417 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004418 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004419 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004420 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004421 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004422 goto utf8Error;
4423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004424 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004425 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004426 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004427 break;
4428
4429 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004430 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4431 will result in surrogates in range d800-dfff. Surrogates are
4432 not valid UTF-8 so they are rejected.
4433 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4434 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004435 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004436 (s[2] & 0xc0) != 0x80 ||
4437 ((unsigned char)s[0] == 0xE0 &&
4438 (unsigned char)s[1] < 0xA0) ||
4439 ((unsigned char)s[0] == 0xED &&
4440 (unsigned char)s[1] > 0x9F)) {
4441 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004442 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004443 endinpos = startinpos + 1;
4444
4445 /* if s[1] first two bits are 1 and 0, then the invalid
4446 continuation byte is s[2], so increment endinpos by 1,
4447 if not, s[1] is invalid and endinpos doesn't need to
4448 be incremented. */
4449 if ((s[1] & 0xC0) == 0x80)
4450 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004451 goto utf8Error;
4452 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004454 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004455 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004456 break;
4457
4458 case 4:
4459 if ((s[1] & 0xc0) != 0x80 ||
4460 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004461 (s[3] & 0xc0) != 0x80 ||
4462 ((unsigned char)s[0] == 0xF0 &&
4463 (unsigned char)s[1] < 0x90) ||
4464 ((unsigned char)s[0] == 0xF4 &&
4465 (unsigned char)s[1] > 0x8F)) {
4466 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004467 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004468 endinpos = startinpos + 1;
4469 if ((s[1] & 0xC0) == 0x80) {
4470 endinpos++;
4471 if ((s[2] & 0xC0) == 0x80)
4472 endinpos++;
4473 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004474 goto utf8Error;
4475 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004476 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004477 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004478 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004479
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004480 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482 }
4483 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004484 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004485
Benjamin Peterson29060642009-01-31 22:14:21 +00004486 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004487 if (unicode_decode_call_errorhandler(
4488 errors, &errorHandler,
4489 "utf8", errmsg,
4490 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004491 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004492 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004493 /* Update data because unicode_decode_call_errorhandler might have
4494 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004495 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004496 }
Walter Dörwald69652032004-09-07 20:24:22 +00004497 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004498 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004500 /* Adjust length and ready string when it contained errors and
4501 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004502 if (unicode_resize(&unicode, i) < 0)
4503 goto onError;
4504 unicode_adjust_maxchar(&unicode);
4505 if (unicode == NULL)
4506 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004508 Py_XDECREF(errorHandler);
4509 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004510 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004511 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004512
Benjamin Peterson29060642009-01-31 22:14:21 +00004513 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514 Py_XDECREF(errorHandler);
4515 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004516 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004517 return NULL;
4518}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004519#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004520
Victor Stinner785938e2011-12-11 20:09:03 +01004521PyObject *
4522PyUnicode_DecodeUTF8Stateful(const char *s,
4523 Py_ssize_t size,
4524 const char *errors,
4525 Py_ssize_t *consumed)
4526{
4527 Py_UCS4 maxchar = 0;
4528 Py_ssize_t unicode_size;
4529 int has_errors = 0;
4530 PyObject *unicode;
4531 int kind;
4532 void *data;
4533 const char *starts = s;
4534 const char *e;
4535 Py_ssize_t i;
4536
4537 if (size == 0) {
4538 if (consumed)
4539 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004540 Py_INCREF(unicode_empty);
4541 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004542 }
4543
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004544 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004545
4546 /* When the string is ASCII only, just use memcpy and return.
4547 unicode_size may be != size if there is an incomplete UTF-8
4548 sequence at the end of the ASCII block. */
4549 if (maxchar < 128 && size == unicode_size) {
4550 if (consumed)
4551 *consumed = size;
4552 return unicode_fromascii(s, size);
4553 }
4554
4555 unicode = PyUnicode_New(unicode_size, maxchar);
4556 if (!unicode)
4557 return NULL;
4558 kind = PyUnicode_KIND(unicode);
4559 data = PyUnicode_DATA(unicode);
4560
4561 /* Unpack UTF-8 encoded data */
4562 i = 0;
4563 e = starts + size;
4564 switch (kind) {
4565 case PyUnicode_1BYTE_KIND:
4566 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4567 break;
4568 case PyUnicode_2BYTE_KIND:
4569 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4570 break;
4571 case PyUnicode_4BYTE_KIND:
4572 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4573 break;
4574 }
4575 if (!has_errors) {
4576 /* Ensure the unicode size calculation was correct */
4577 assert(i == unicode_size);
4578 assert(s == e);
4579 if (consumed)
4580 *consumed = size;
4581 return unicode;
4582 }
4583
4584 /* In case of errors, maxchar and size computation might be incorrect;
4585 code below refits and resizes as necessary. */
4586 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4587}
4588
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes 'size' bytes starting at 's' into a NUL-terminated wchar_t
   string allocated with PyMem_Malloc.  Every byte that is not part of a
   valid sequence is stored as 0xDC00 + byte.  Returns NULL if the buffer
   cannot be allocated. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: each input byte yields at most one wchar_t, so size + 1
       elements (including the terminating NUL) are always enough */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        /* Truncated sequence.  Lengths are compared instead of computing
           s + n, which could point more than one element past the end of
           the buffer. */
        if (n > e - s) {
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */
            *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* ch still holds the undecodable byte (it is only overwritten once
           a sequence has been validated): escape it and resume decoding at
           the next byte. */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004697/* Primary internal function which creates utf8 encoded bytes objects.
4698
4699 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004700 and allocate exactly as much space needed at the end. Else allocate the
4701 maximum possible needed (4 result bytes per Unicode character), and return
4702 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004703*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004704PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004705_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706{
Tim Peters602f7402002-04-27 18:03:26 +00004707#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004708
Guido van Rossum98297ee2007-11-06 21:34:58 +00004709 Py_ssize_t i; /* index into s of next input byte */
4710 PyObject *result; /* result string object */
4711 char *p; /* next free byte in output buffer */
4712 Py_ssize_t nallocated; /* number of result bytes allocated */
4713 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004714 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004715 PyObject *errorHandler = NULL;
4716 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004717 int kind;
4718 void *data;
4719 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004720 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004722 if (!PyUnicode_Check(unicode)) {
4723 PyErr_BadArgument();
4724 return NULL;
4725 }
4726
4727 if (PyUnicode_READY(unicode) == -1)
4728 return NULL;
4729
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004730 if (PyUnicode_UTF8(unicode))
4731 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4732 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004733
4734 kind = PyUnicode_KIND(unicode);
4735 data = PyUnicode_DATA(unicode);
4736 size = PyUnicode_GET_LENGTH(unicode);
4737
Tim Peters602f7402002-04-27 18:03:26 +00004738 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739
Tim Peters602f7402002-04-27 18:03:26 +00004740 if (size <= MAX_SHORT_UNICHARS) {
4741 /* Write into the stack buffer; nallocated can't overflow.
4742 * At the end, we'll allocate exactly as much heap space as it
4743 * turns out we need.
4744 */
4745 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004746 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004747 p = stackbuf;
4748 }
4749 else {
4750 /* Overallocate on the heap, and give the excess back at the end. */
4751 nallocated = size * 4;
4752 if (nallocated / 4 != size) /* overflow! */
4753 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004754 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004755 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004756 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004757 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004758 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004759
Tim Peters602f7402002-04-27 18:03:26 +00004760 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004761 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004762
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004763 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004764 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004766
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004768 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004769 *p++ = (char)(0xc0 | (ch >> 6));
4770 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner551ac952011-11-29 22:58:13 +01004771 } else if (Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004772 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004773 Py_ssize_t repsize, k, startpos;
4774 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004775 rep = unicode_encode_call_errorhandler(
4776 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004777 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004778 if (!rep)
4779 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004781 if (PyBytes_Check(rep))
4782 repsize = PyBytes_GET_SIZE(rep);
4783 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004784 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004785
4786 if (repsize > 4) {
4787 Py_ssize_t offset;
4788
4789 if (result == NULL)
4790 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004791 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004792 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004794 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4795 /* integer overflow */
4796 PyErr_NoMemory();
4797 goto error;
4798 }
4799 nallocated += repsize - 4;
4800 if (result != NULL) {
4801 if (_PyBytes_Resize(&result, nallocated) < 0)
4802 goto error;
4803 } else {
4804 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004805 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004806 goto error;
4807 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4808 }
4809 p = PyBytes_AS_STRING(result) + offset;
4810 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004812 if (PyBytes_Check(rep)) {
4813 char *prep = PyBytes_AS_STRING(rep);
4814 for(k = repsize; k > 0; k--)
4815 *p++ = *prep++;
4816 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004817 enum PyUnicode_Kind repkind;
4818 void *repdata;
4819
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004820 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004821 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004822 repkind = PyUnicode_KIND(rep);
4823 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004824
4825 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004826 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004827 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004828 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004829 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004830 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004831 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004832 goto error;
4833 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004834 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004835 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004836 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004837 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004838 } else if (ch < 0x10000) {
4839 *p++ = (char)(0xe0 | (ch >> 12));
4840 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4841 *p++ = (char)(0x80 | (ch & 0x3f));
4842 } else /* ch >= 0x10000 */ {
Victor Stinner8faf8212011-12-08 22:14:11 +01004843 assert(ch <= MAX_UNICODE);
Tim Peters602f7402002-04-27 18:03:26 +00004844 /* Encode UCS4 Unicode ordinals */
4845 *p++ = (char)(0xf0 | (ch >> 18));
4846 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4847 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4848 *p++ = (char)(0x80 | (ch & 0x3f));
4849 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004850 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004851
Guido van Rossum98297ee2007-11-06 21:34:58 +00004852 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004853 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004854 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004855 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004856 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004857 }
4858 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004859 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004860 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004861 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004862 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004863 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004864
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004865 Py_XDECREF(errorHandler);
4866 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004867 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004868 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004869 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004870 Py_XDECREF(errorHandler);
4871 Py_XDECREF(exc);
4872 Py_XDECREF(result);
4873 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004874
Tim Peters602f7402002-04-27 18:03:26 +00004875#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876}
4877
Alexander Belopolsky40018472011-02-26 01:02:56 +00004878PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004879PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4880 Py_ssize_t size,
4881 const char *errors)
4882{
4883 PyObject *v, *unicode;
4884
4885 unicode = PyUnicode_FromUnicode(s, size);
4886 if (unicode == NULL)
4887 return NULL;
4888 v = _PyUnicode_AsUTF8String(unicode, errors);
4889 Py_DECREF(unicode);
4890 return v;
4891}
4892
4893PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004894PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004896 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897}
4898
Walter Dörwald41980ca2007-08-16 21:55:45 +00004899/* --- UTF-32 Codec ------------------------------------------------------- */
4900
4901PyObject *
4902PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004903 Py_ssize_t size,
4904 const char *errors,
4905 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004906{
4907 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4908}
4909
4910PyObject *
4911PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004912 Py_ssize_t size,
4913 const char *errors,
4914 int *byteorder,
4915 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004916{
4917 const char *starts = s;
4918 Py_ssize_t startinpos;
4919 Py_ssize_t endinpos;
4920 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004921 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004922 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004923 int bo = 0; /* assume native ordering by default */
4924 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004925 /* Offsets from q for retrieving bytes in the right order. */
4926#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4927 int iorder[] = {0, 1, 2, 3};
4928#else
4929 int iorder[] = {3, 2, 1, 0};
4930#endif
4931 PyObject *errorHandler = NULL;
4932 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004933
Walter Dörwald41980ca2007-08-16 21:55:45 +00004934 q = (unsigned char *)s;
4935 e = q + size;
4936
4937 if (byteorder)
4938 bo = *byteorder;
4939
4940 /* Check for BOM marks (U+FEFF) in the input and adjust current
4941 byte order setting accordingly. In native mode, the leading BOM
4942 mark is skipped, in all other modes, it is copied to the output
4943 stream as-is (giving a ZWNBSP character). */
4944 if (bo == 0) {
4945 if (size >= 4) {
4946 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004947 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004948#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004949 if (bom == 0x0000FEFF) {
4950 q += 4;
4951 bo = -1;
4952 }
4953 else if (bom == 0xFFFE0000) {
4954 q += 4;
4955 bo = 1;
4956 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004957#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004958 if (bom == 0x0000FEFF) {
4959 q += 4;
4960 bo = 1;
4961 }
4962 else if (bom == 0xFFFE0000) {
4963 q += 4;
4964 bo = -1;
4965 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004966#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004967 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004968 }
4969
4970 if (bo == -1) {
4971 /* force LE */
4972 iorder[0] = 0;
4973 iorder[1] = 1;
4974 iorder[2] = 2;
4975 iorder[3] = 3;
4976 }
4977 else if (bo == 1) {
4978 /* force BE */
4979 iorder[0] = 3;
4980 iorder[1] = 2;
4981 iorder[2] = 1;
4982 iorder[3] = 0;
4983 }
4984
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004985 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004986 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004987 if (!unicode)
4988 return NULL;
4989 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01004990 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004991 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004992
Walter Dörwald41980ca2007-08-16 21:55:45 +00004993 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004994 Py_UCS4 ch;
4995 /* remaining bytes at the end? (size should be divisible by 4) */
4996 if (e-q<4) {
4997 if (consumed)
4998 break;
4999 errmsg = "truncated data";
5000 startinpos = ((const char *)q)-starts;
5001 endinpos = ((const char *)e)-starts;
5002 goto utf32Error;
5003 /* The remaining input chars are ignored if the callback
5004 chooses to skip the input */
5005 }
5006 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5007 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005008
Benjamin Peterson29060642009-01-31 22:14:21 +00005009 if (ch >= 0x110000)
5010 {
5011 errmsg = "codepoint not in range(0x110000)";
5012 startinpos = ((const char *)q)-starts;
5013 endinpos = startinpos+4;
5014 goto utf32Error;
5015 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005016 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5017 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005018 q += 4;
5019 continue;
5020 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005021 if (unicode_decode_call_errorhandler(
5022 errors, &errorHandler,
5023 "utf32", errmsg,
5024 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005025 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005026 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005027 }
5028
5029 if (byteorder)
5030 *byteorder = bo;
5031
5032 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005033 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005034
5035 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005036 if (PyUnicode_Resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005037 goto onError;
5038
5039 Py_XDECREF(errorHandler);
5040 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005041 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005042
Benjamin Peterson29060642009-01-31 22:14:21 +00005043 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005044 Py_DECREF(unicode);
5045 Py_XDECREF(errorHandler);
5046 Py_XDECREF(exc);
5047 return NULL;
5048}
5049
5050PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005051_PyUnicode_EncodeUTF32(PyObject *str,
5052 const char *errors,
5053 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005054{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005055 int kind;
5056 void *data;
5057 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005058 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005059 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005060 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005061 /* Offsets from p for storing byte pairs in the right order. */
5062#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5063 int iorder[] = {0, 1, 2, 3};
5064#else
5065 int iorder[] = {3, 2, 1, 0};
5066#endif
5067
Benjamin Peterson29060642009-01-31 22:14:21 +00005068#define STORECHAR(CH) \
5069 do { \
5070 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5071 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5072 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5073 p[iorder[0]] = (CH) & 0xff; \
5074 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005075 } while(0)
5076
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005077 if (!PyUnicode_Check(str)) {
5078 PyErr_BadArgument();
5079 return NULL;
5080 }
5081 if (PyUnicode_READY(str) < 0)
5082 return NULL;
5083 kind = PyUnicode_KIND(str);
5084 data = PyUnicode_DATA(str);
5085 len = PyUnicode_GET_LENGTH(str);
5086
5087 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005088 bytesize = nsize * 4;
5089 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005090 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005091 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005092 if (v == NULL)
5093 return NULL;
5094
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005095 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005096 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005097 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005098 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005099 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005100
5101 if (byteorder == -1) {
5102 /* force LE */
5103 iorder[0] = 0;
5104 iorder[1] = 1;
5105 iorder[2] = 2;
5106 iorder[3] = 3;
5107 }
5108 else if (byteorder == 1) {
5109 /* force BE */
5110 iorder[0] = 3;
5111 iorder[1] = 2;
5112 iorder[2] = 1;
5113 iorder[3] = 0;
5114 }
5115
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005116 for (i = 0; i < len; i++)
5117 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005118
5119 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005120 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005121#undef STORECHAR
5122}
5123
Alexander Belopolsky40018472011-02-26 01:02:56 +00005124PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005125PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5126 Py_ssize_t size,
5127 const char *errors,
5128 int byteorder)
5129{
5130 PyObject *result;
5131 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5132 if (tmp == NULL)
5133 return NULL;
5134 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5135 Py_DECREF(tmp);
5136 return result;
5137}
5138
5139PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005140PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005141{
Victor Stinnerb960b342011-11-20 19:12:52 +01005142 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005143}
5144
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145/* --- UTF-16 Codec ------------------------------------------------------- */
5146
Tim Peters772747b2001-08-09 22:21:55 +00005147PyObject *
5148PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005149 Py_ssize_t size,
5150 const char *errors,
5151 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152{
Walter Dörwald69652032004-09-07 20:24:22 +00005153 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5154}
5155
Antoine Pitrouab868312009-01-10 15:40:25 +00005156/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.

   FAST_CHAR_MASK has bit 15 (0x8000) set in every 16-bit lane of the long;
   SWAPPED_FAST_CHAR_MASK has bit 7 (0x0080) set in every lane instead.
   One pair of masks is provided for 8-byte longs and one for 4-byte longs;
   any other size of 'long' is rejected at compile time.
*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005163#if (SIZEOF_LONG == 8)
5164# define FAST_CHAR_MASK 0x8000800080008000L
5165# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5166#elif (SIZEOF_LONG == 4)
5167# define FAST_CHAR_MASK 0x80008000L
5168# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5169#else
5170# error C 'long' size should be either 4 or 8!
5171#endif
5172
Walter Dörwald69652032004-09-07 20:24:22 +00005173PyObject *
5174PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005175 Py_ssize_t size,
5176 const char *errors,
5177 int *byteorder,
5178 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005179{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005180 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005181 Py_ssize_t startinpos;
5182 Py_ssize_t endinpos;
5183 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005184 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005185 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005186 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005187 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005188 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005189 /* Offsets from q for retrieving byte pairs in the right order. */
5190#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5191 int ihi = 1, ilo = 0;
5192#else
5193 int ihi = 0, ilo = 1;
5194#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005195 PyObject *errorHandler = NULL;
5196 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197
5198 /* Note: size will always be longer than the resulting Unicode
5199 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005200 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201 if (!unicode)
5202 return NULL;
5203 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005204 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005205 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206
Tim Peters772747b2001-08-09 22:21:55 +00005207 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005208 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209
5210 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005211 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005213 /* Check for BOM marks (U+FEFF) in the input and adjust current
5214 byte order setting accordingly. In native mode, the leading BOM
5215 mark is skipped, in all other modes, it is copied to the output
5216 stream as-is (giving a ZWNBSP character). */
5217 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005218 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005219 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005220#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005221 if (bom == 0xFEFF) {
5222 q += 2;
5223 bo = -1;
5224 }
5225 else if (bom == 0xFFFE) {
5226 q += 2;
5227 bo = 1;
5228 }
Tim Petersced69f82003-09-16 20:30:58 +00005229#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 if (bom == 0xFEFF) {
5231 q += 2;
5232 bo = 1;
5233 }
5234 else if (bom == 0xFFFE) {
5235 q += 2;
5236 bo = -1;
5237 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005238#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241
Tim Peters772747b2001-08-09 22:21:55 +00005242 if (bo == -1) {
5243 /* force LE */
5244 ihi = 1;
5245 ilo = 0;
5246 }
5247 else if (bo == 1) {
5248 /* force BE */
5249 ihi = 0;
5250 ilo = 1;
5251 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005252#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5253 native_ordering = ilo < ihi;
5254#else
5255 native_ordering = ilo > ihi;
5256#endif
Tim Peters772747b2001-08-09 22:21:55 +00005257
Antoine Pitrouab868312009-01-10 15:40:25 +00005258 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005259 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005260 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005261 /* First check for possible aligned read of a C 'long'. Unaligned
5262 reads are more expensive, better to defer to another iteration. */
5263 if (!((size_t) q & LONG_PTR_MASK)) {
5264 /* Fast path for runs of non-surrogate chars. */
5265 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005266 int kind = PyUnicode_KIND(unicode);
5267 void *data = PyUnicode_DATA(unicode);
5268 while (_q < aligned_end) {
5269 unsigned long block = * (unsigned long *) _q;
5270 unsigned short *pblock = (unsigned short*)&block;
5271 Py_UCS4 maxch;
5272 if (native_ordering) {
5273 /* Can use buffer directly */
5274 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005275 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005276 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005277 else {
5278 /* Need to byte-swap */
5279 unsigned char *_p = (unsigned char*)pblock;
5280 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005281 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005282 _p[0] = _q[1];
5283 _p[1] = _q[0];
5284 _p[2] = _q[3];
5285 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005286#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005287 _p[4] = _q[5];
5288 _p[5] = _q[4];
5289 _p[6] = _q[7];
5290 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005291#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005292 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005293 maxch = Py_MAX(pblock[0], pblock[1]);
5294#if SIZEOF_LONG == 8
5295 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5296#endif
5297 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5298 if (unicode_widen(&unicode, maxch) < 0)
5299 goto onError;
5300 kind = PyUnicode_KIND(unicode);
5301 data = PyUnicode_DATA(unicode);
5302 }
5303 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5304 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5305#if SIZEOF_LONG == 8
5306 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5307 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5308#endif
5309 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005310 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005311 q = _q;
5312 if (q >= e)
5313 break;
5314 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005315 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005316
Benjamin Peterson14339b62009-01-31 16:36:08 +00005317 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005318
Victor Stinner551ac952011-11-29 22:58:13 +01005319 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005320 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5321 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 continue;
5323 }
5324
5325 /* UTF-16 code pair: */
5326 if (q > e) {
5327 errmsg = "unexpected end of data";
5328 startinpos = (((const char *)q) - 2) - starts;
5329 endinpos = ((const char *)e) + 1 - starts;
5330 goto utf16Error;
5331 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005332 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5333 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005335 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005336 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005337 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005338 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005339 continue;
5340 }
5341 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005342 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005343 startinpos = (((const char *)q)-4)-starts;
5344 endinpos = startinpos+2;
5345 goto utf16Error;
5346 }
5347
Benjamin Peterson14339b62009-01-31 16:36:08 +00005348 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005349 errmsg = "illegal encoding";
5350 startinpos = (((const char *)q)-2)-starts;
5351 endinpos = startinpos+2;
5352 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005353
Benjamin Peterson29060642009-01-31 22:14:21 +00005354 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005356 errors,
5357 &errorHandler,
5358 "utf16", errmsg,
5359 &starts,
5360 (const char **)&e,
5361 &startinpos,
5362 &endinpos,
5363 &exc,
5364 (const char **)&q,
5365 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005366 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005369 /* remaining byte at the end? (size should be even) */
5370 if (e == q) {
5371 if (!consumed) {
5372 errmsg = "truncated data";
5373 startinpos = ((const char *)q) - starts;
5374 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005375 if (unicode_decode_call_errorhandler(
5376 errors,
5377 &errorHandler,
5378 "utf16", errmsg,
5379 &starts,
5380 (const char **)&e,
5381 &startinpos,
5382 &endinpos,
5383 &exc,
5384 (const char **)&q,
5385 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005386 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005387 goto onError;
5388 /* The remaining input chars are ignored if the callback
5389 chooses to skip the input */
5390 }
5391 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392
5393 if (byteorder)
5394 *byteorder = bo;
5395
Walter Dörwald69652032004-09-07 20:24:22 +00005396 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005397 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005398
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005400 if (PyUnicode_Resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 goto onError;
5402
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005403 Py_XDECREF(errorHandler);
5404 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005405 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406
Benjamin Peterson29060642009-01-31 22:14:21 +00005407 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005409 Py_XDECREF(errorHandler);
5410 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411 return NULL;
5412}
5413
/* The surrogate-check masks are only used by the UTF-16 decoder defined
   above, so they are undefined again here. */
Antoine Pitrouab868312009-01-10 15:40:25 +00005414#undef FAST_CHAR_MASK
5415#undef SWAPPED_FAST_CHAR_MASK
5416
Tim Peters772747b2001-08-09 22:21:55 +00005417PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005418_PyUnicode_EncodeUTF16(PyObject *str,
5419 const char *errors,
5420 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005422 int kind;
5423 void *data;
5424 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005425 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005426 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005427 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005428 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005429 /* Offsets from p for storing byte pairs in the right order. */
5430#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5431 int ihi = 1, ilo = 0;
5432#else
5433 int ihi = 0, ilo = 1;
5434#endif
5435
Benjamin Peterson29060642009-01-31 22:14:21 +00005436#define STORECHAR(CH) \
5437 do { \
5438 p[ihi] = ((CH) >> 8) & 0xff; \
5439 p[ilo] = (CH) & 0xff; \
5440 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005441 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005443 if (!PyUnicode_Check(str)) {
5444 PyErr_BadArgument();
5445 return NULL;
5446 }
5447 if (PyUnicode_READY(str) < 0)
5448 return NULL;
5449 kind = PyUnicode_KIND(str);
5450 data = PyUnicode_DATA(str);
5451 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005452
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005453 pairs = 0;
5454 if (kind == PyUnicode_4BYTE_KIND)
5455 for (i = 0; i < len; i++)
5456 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5457 pairs++;
5458 /* 2 * (len + pairs + (byteorder == 0)) */
5459 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005460 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005461 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005462 bytesize = nsize * 2;
5463 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005464 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005465 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466 if (v == NULL)
5467 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005469 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005471 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005472 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005473 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005474
5475 if (byteorder == -1) {
5476 /* force LE */
5477 ihi = 1;
5478 ilo = 0;
5479 }
5480 else if (byteorder == 1) {
5481 /* force BE */
5482 ihi = 0;
5483 ilo = 1;
5484 }
5485
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005486 for (i = 0; i < len; i++) {
5487 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5488 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005489 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005490 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5491 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005492 }
Tim Peters772747b2001-08-09 22:21:55 +00005493 STORECHAR(ch);
5494 if (ch2)
5495 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005496 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005497
5498 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005499 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005500#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501}
5502
Alexander Belopolsky40018472011-02-26 01:02:56 +00005503PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005504PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5505 Py_ssize_t size,
5506 const char *errors,
5507 int byteorder)
5508{
5509 PyObject *result;
5510 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5511 if (tmp == NULL)
5512 return NULL;
5513 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5514 Py_DECREF(tmp);
5515 return result;
5516}
5517
5518PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005519PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005521 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522}
5523
5524/* --- Unicode Escape Codec ----------------------------------------------- */
5525
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped input is still a pure ASCII string once its escapes have
   been resolved.

   Returns the length of the buffer required to hold the decoded string,
   or -1 if size is negative, or if the input contains a non-ASCII byte,
   a backslash as its last byte, a backslash followed by a non-ASCII byte,
   or an octal, x, u, U or N escape (these do not guarantee an ASCII
   result).
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a bad size before it takes part in any pointer arithmetic. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + newline result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* single-character escapes decode to one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5580
/* File-local pointer to a _PyUnicode_Name_CAPI table; it starts out NULL.
   NOTE(review): presumably filled in on first use by the N{...} escape
   handling of the unicode-escape decoder -- confirm against its users. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005581static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005582
Alexander Belopolsky40018472011-02-26 01:02:56 +00005583PyObject *
5584PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005585 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005586 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005588 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005589 Py_ssize_t startinpos;
5590 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005591 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005592 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005594 char* message;
5595 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005596 PyObject *errorHandler = NULL;
5597 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005598 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005599 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005600
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005601 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005602
5603 /* After length_of_escaped_ascii_string() there are two alternatives,
5604 either the string is pure ASCII with named escapes like \n, etc.
5605 and we determined it's exact size (common case)
5606 or it contains \x, \u, ... escape sequences. then we create a
5607 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005608 if (len >= 0) {
5609 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005610 if (!v)
5611 goto onError;
5612 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005613 }
5614 else {
5615 /* Escaped strings will always be longer than the resulting
5616 Unicode string, so we start with size here and then reduce the
5617 length after conversion to the true value.
5618 (but if the error callback returns a long replacement string
5619 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005620 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005621 if (!v)
5622 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005623 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005624 }
5625
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005627 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005628 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005630
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 while (s < end) {
5632 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005633 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005634 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005636 /* The only case in which i == ascii_length is a backslash
5637 followed by a newline. */
5638 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005639
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 /* Non-escape characters are interpreted as Unicode ordinals */
5641 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005642 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5643 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 continue;
5645 }
5646
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005647 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648 /* \ - Escapes */
5649 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005650 c = *s++;
5651 if (s > end)
5652 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005653
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005654 /* The only case in which i == ascii_length is a backslash
5655 followed by a newline. */
5656 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005657
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005658 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659
Benjamin Peterson29060642009-01-31 22:14:21 +00005660 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005661#define WRITECHAR(ch) \
5662 do { \
5663 if (unicode_putchar(&v, &i, ch) < 0) \
5664 goto onError; \
5665 }while(0)
5666
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005668 case '\\': WRITECHAR('\\'); break;
5669 case '\'': WRITECHAR('\''); break;
5670 case '\"': WRITECHAR('\"'); break;
5671 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005672 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005673 case 'f': WRITECHAR('\014'); break;
5674 case 't': WRITECHAR('\t'); break;
5675 case 'n': WRITECHAR('\n'); break;
5676 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005677 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005678 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005679 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005680 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681
Benjamin Peterson29060642009-01-31 22:14:21 +00005682 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 case '0': case '1': case '2': case '3':
5684 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005685 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005686 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005687 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005688 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005689 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005691 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692 break;
5693
Benjamin Peterson29060642009-01-31 22:14:21 +00005694 /* hex escapes */
5695 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005697 digits = 2;
5698 message = "truncated \\xXX escape";
5699 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005703 digits = 4;
5704 message = "truncated \\uXXXX escape";
5705 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005708 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005709 digits = 8;
5710 message = "truncated \\UXXXXXXXX escape";
5711 hexescape:
5712 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005713 if (s+digits>end) {
5714 endinpos = size;
5715 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 errors, &errorHandler,
5717 "unicodeescape", "end of string in escape sequence",
5718 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005719 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005720 goto onError;
5721 goto nextByte;
5722 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005723 for (j = 0; j < digits; ++j) {
5724 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005725 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005726 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005727 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005728 errors, &errorHandler,
5729 "unicodeescape", message,
5730 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005731 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005732 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005733 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005734 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005735 }
5736 chr = (chr<<4) & ~0xF;
5737 if (c >= '0' && c <= '9')
5738 chr += c - '0';
5739 else if (c >= 'a' && c <= 'f')
5740 chr += 10 + c - 'a';
5741 else
5742 chr += 10 + c - 'A';
5743 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005744 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005745 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005746 /* _decoding_error will have already written into the
5747 target buffer. */
5748 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005749 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005750 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005751 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005752 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005753 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005754 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005755 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005756 errors, &errorHandler,
5757 "unicodeescape", "illegal Unicode character",
5758 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005759 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005760 goto onError;
5761 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005762 break;
5763
Benjamin Peterson29060642009-01-31 22:14:21 +00005764 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005765 case 'N':
5766 message = "malformed \\N character escape";
5767 if (ucnhash_CAPI == NULL) {
5768 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005769 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5770 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005771 if (ucnhash_CAPI == NULL)
5772 goto ucnhashError;
5773 }
5774 if (*s == '{') {
5775 const char *start = s+1;
5776 /* look for the closing brace */
5777 while (*s != '}' && s < end)
5778 s++;
5779 if (s > start && s < end && *s == '}') {
5780 /* found a name. look it up in the unicode database */
5781 message = "unknown Unicode character name";
5782 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005783 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005784 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005785 goto store;
5786 }
5787 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005788 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005789 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005790 errors, &errorHandler,
5791 "unicodeescape", message,
5792 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005793 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005794 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005795 break;
5796
5797 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005798 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005799 message = "\\ at end of string";
5800 s--;
5801 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005802 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 errors, &errorHandler,
5804 "unicodeescape", message,
5805 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005806 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005807 goto onError;
5808 }
5809 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005810 WRITECHAR('\\');
5811 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005812 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005813 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005816 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005818#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005819
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005820 if (PyUnicode_Resize(&v, i) < 0)
5821 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005822 Py_XDECREF(errorHandler);
5823 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005824 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005825
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005827 PyErr_SetString(
5828 PyExc_UnicodeError,
5829 "\\N escapes not supported (can't load unicodedata module)"
5830 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005831 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005832 Py_XDECREF(errorHandler);
5833 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005834 return NULL;
5835
Benjamin Peterson29060642009-01-31 22:14:21 +00005836 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005838 Py_XDECREF(errorHandler);
5839 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840 return NULL;
5841}
5842
5843/* Return a Unicode-Escape string version of the Unicode object.
5844
5845 If quotes is true, the string is enclosed in u"" or u'' quotes as
5846 appropriate.
5847
5848*/
5849
Alexander Belopolsky40018472011-02-26 01:02:56 +00005850PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005851PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005853 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005854 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005856 int kind;
5857 void *data;
5858 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859
Thomas Wouters89f507f2006-12-13 04:49:30 +00005860 /* Initial allocation is based on the longest-possible unichr
5861 escape.
5862
5863 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5864 unichr, so in this case it's the longest unichr escape. In
5865 narrow (UTF-16) builds this is five chars per source unichr
5866 since there are two unichrs in the surrogate pair, so in narrow
5867 (UTF-16) builds it's not the longest unichr escape.
5868
5869 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5870 so in the narrow (UTF-16) build case it's the longest unichr
5871 escape.
5872 */
5873
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005874 if (!PyUnicode_Check(unicode)) {
5875 PyErr_BadArgument();
5876 return NULL;
5877 }
5878 if (PyUnicode_READY(unicode) < 0)
5879 return NULL;
5880 len = PyUnicode_GET_LENGTH(unicode);
5881 kind = PyUnicode_KIND(unicode);
5882 data = PyUnicode_DATA(unicode);
5883 switch(kind) {
5884 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5885 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5886 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5887 }
5888
5889 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005890 return PyBytes_FromStringAndSize(NULL, 0);
5891
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005892 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005893 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005894
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005895 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005896 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005897 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005898 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899 if (repr == NULL)
5900 return NULL;
5901
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005902 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005904 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005905 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005906
Walter Dörwald79e913e2007-05-12 11:08:06 +00005907 /* Escape backslashes */
5908 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 *p++ = '\\';
5910 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005911 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005912 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005913
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005914 /* Map 21-bit characters to '\U00xxxxxx' */
5915 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005916 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005917 *p++ = '\\';
5918 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005919 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5920 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5921 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5922 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5923 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5924 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5925 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5926 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005928 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005929
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005931 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 *p++ = '\\';
5933 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005934 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5935 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5936 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5937 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005939
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005940 /* Map special whitespace to '\t', \n', '\r' */
5941 else if (ch == '\t') {
5942 *p++ = '\\';
5943 *p++ = 't';
5944 }
5945 else if (ch == '\n') {
5946 *p++ = '\\';
5947 *p++ = 'n';
5948 }
5949 else if (ch == '\r') {
5950 *p++ = '\\';
5951 *p++ = 'r';
5952 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005953
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005954 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005955 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005957 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005958 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5959 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005960 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005961
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 /* Copy everything else as-is */
5963 else
5964 *p++ = (char) ch;
5965 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005967 assert(p - PyBytes_AS_STRING(repr) > 0);
5968 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5969 return NULL;
5970 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971}
5972
Alexander Belopolsky40018472011-02-26 01:02:56 +00005973PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005974PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5975 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005977 PyObject *result;
5978 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5979 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005981 result = PyUnicode_AsUnicodeEscapeString(tmp);
5982 Py_DECREF(tmp);
5983 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984}
5985
5986/* --- Raw Unicode Escape Codec ------------------------------------------- */
5987
Alexander Belopolsky40018472011-02-26 01:02:56 +00005988PyObject *
5989PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005990 Py_ssize_t size,
5991 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005993 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005994 Py_ssize_t startinpos;
5995 Py_ssize_t endinpos;
5996 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005997 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 const char *end;
5999 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006000 PyObject *errorHandler = NULL;
6001 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006002
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003 /* Escaped strings will always be longer than the resulting
6004 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006005 length after conversion to the true value. (But decoding error
6006 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006007 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006011 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006012 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 end = s + size;
6014 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006015 unsigned char c;
6016 Py_UCS4 x;
6017 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006018 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019
Benjamin Peterson29060642009-01-31 22:14:21 +00006020 /* Non-escape characters are interpreted as Unicode ordinals */
6021 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006022 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6023 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006025 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006026 startinpos = s-starts;
6027
6028 /* \u-escapes are only interpreted iff the number of leading
6029 backslashes if odd */
6030 bs = s;
6031 for (;s < end;) {
6032 if (*s != '\\')
6033 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006034 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6035 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 }
6037 if (((s - bs) & 1) == 0 ||
6038 s >= end ||
6039 (*s != 'u' && *s != 'U')) {
6040 continue;
6041 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006042 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006043 count = *s=='u' ? 4 : 8;
6044 s++;
6045
6046 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006047 for (x = 0, i = 0; i < count; ++i, ++s) {
6048 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006049 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 endinpos = s-starts;
6051 if (unicode_decode_call_errorhandler(
6052 errors, &errorHandler,
6053 "rawunicodeescape", "truncated \\uXXXX",
6054 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006055 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 goto onError;
6057 goto nextByte;
6058 }
6059 x = (x<<4) & ~0xF;
6060 if (c >= '0' && c <= '9')
6061 x += c - '0';
6062 else if (c >= 'a' && c <= 'f')
6063 x += 10 + c - 'a';
6064 else
6065 x += 10 + c - 'A';
6066 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006067 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006068 if (unicode_putchar(&v, &outpos, x) < 0)
6069 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006070 } else {
6071 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006072 if (unicode_decode_call_errorhandler(
6073 errors, &errorHandler,
6074 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006076 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006078 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006079 nextByte:
6080 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006082 if (PyUnicode_Resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006083 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006084 Py_XDECREF(errorHandler);
6085 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006086 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006087
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006090 Py_XDECREF(errorHandler);
6091 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092 return NULL;
6093}
6094
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006095
Alexander Belopolsky40018472011-02-26 01:02:56 +00006096PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006097PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006099 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100 char *p;
6101 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006102 Py_ssize_t expandsize, pos;
6103 int kind;
6104 void *data;
6105 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006107 if (!PyUnicode_Check(unicode)) {
6108 PyErr_BadArgument();
6109 return NULL;
6110 }
6111 if (PyUnicode_READY(unicode) < 0)
6112 return NULL;
6113 kind = PyUnicode_KIND(unicode);
6114 data = PyUnicode_DATA(unicode);
6115 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006116 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6117 bytes, and 1 byte characters 4. */
6118 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006119
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006120 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006122
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006123 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 if (repr == NULL)
6125 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006126 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006127 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006129 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006130 for (pos = 0; pos < len; pos++) {
6131 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 /* Map 32-bit characters to '\Uxxxxxxxx' */
6133 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006134 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006135 *p++ = '\\';
6136 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006137 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6138 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6139 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6140 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6141 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6142 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6143 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6144 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006145 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006146 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006147 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148 *p++ = '\\';
6149 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006150 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6151 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6152 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6153 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006155 /* Copy everything else as-is */
6156 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 *p++ = (char) ch;
6158 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006159
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006160 assert(p > q);
6161 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006162 return NULL;
6163 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164}
6165
Alexander Belopolsky40018472011-02-26 01:02:56 +00006166PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006167PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6168 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006170 PyObject *result;
6171 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6172 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006173 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006174 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6175 Py_DECREF(tmp);
6176 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177}
6178
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006179/* --- Unicode Internal Codec ------------------------------------------- */
6180
Alexander Belopolsky40018472011-02-26 01:02:56 +00006181PyObject *
6182_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006183 Py_ssize_t size,
6184 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006185{
6186 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006187 Py_ssize_t startinpos;
6188 Py_ssize_t endinpos;
6189 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006190 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006191 const char *end;
6192 const char *reason;
6193 PyObject *errorHandler = NULL;
6194 PyObject *exc = NULL;
6195
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006196 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006197 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006198 1))
6199 return NULL;
6200
Thomas Wouters89f507f2006-12-13 04:49:30 +00006201 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006202 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006203 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006205 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006206 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006207 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006208 end = s + size;
6209
6210 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006211 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006212 Py_UCS4 ch;
6213 /* We copy the raw representation one byte at a time because the
6214 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006215 ((char *) &uch)[0] = s[0];
6216 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006217#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006218 ((char *) &uch)[2] = s[2];
6219 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006220#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006221 ch = uch;
6222
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006223 /* We have to sanity check the raw data, otherwise doom looms for
6224 some malformed UCS-4 data. */
6225 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006226#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006227 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006228#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006229 end-s < Py_UNICODE_SIZE
6230 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006231 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006232 startinpos = s - starts;
6233 if (end-s < Py_UNICODE_SIZE) {
6234 endinpos = end-starts;
6235 reason = "truncated input";
6236 }
6237 else {
6238 endinpos = s - starts + Py_UNICODE_SIZE;
6239 reason = "illegal code point (> 0x10FFFF)";
6240 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006241 if (unicode_decode_call_errorhandler(
6242 errors, &errorHandler,
6243 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006244 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006245 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006246 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006247 continue;
6248 }
6249
6250 s += Py_UNICODE_SIZE;
6251#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006252 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006253 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006254 Py_UNICODE uch2;
6255 ((char *) &uch2)[0] = s[0];
6256 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006257 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006258 {
Victor Stinner551ac952011-11-29 22:58:13 +01006259 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006260 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006261 }
6262 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006263#endif
6264
6265 if (unicode_putchar(&v, &outpos, ch) < 0)
6266 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006267 }
6268
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006269 if (PyUnicode_Resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006270 goto onError;
6271 Py_XDECREF(errorHandler);
6272 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006273 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006274
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006276 Py_XDECREF(v);
6277 Py_XDECREF(errorHandler);
6278 Py_XDECREF(exc);
6279 return NULL;
6280}
6281
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282/* --- Latin-1 Codec ------------------------------------------------------ */
6283
Alexander Belopolsky40018472011-02-26 01:02:56 +00006284PyObject *
6285PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006286 Py_ssize_t size,
6287 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006290 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291}
6292
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006293/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006294static void
6295make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006296 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006297 PyObject *unicode,
6298 Py_ssize_t startpos, Py_ssize_t endpos,
6299 const char *reason)
6300{
6301 if (*exceptionObject == NULL) {
6302 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006303 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006304 encoding, unicode, startpos, endpos, reason);
6305 }
6306 else {
6307 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6308 goto onError;
6309 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6310 goto onError;
6311 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6312 goto onError;
6313 return;
6314 onError:
6315 Py_DECREF(*exceptionObject);
6316 *exceptionObject = NULL;
6317 }
6318}
6319
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006320/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006321static void
6322raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006323 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006324 PyObject *unicode,
6325 Py_ssize_t startpos, Py_ssize_t endpos,
6326 const char *reason)
6327{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006328 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006329 encoding, unicode, startpos, endpos, reason);
6330 if (*exceptionObject != NULL)
6331 PyCodec_StrictErrors(*exceptionObject);
6332}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006333
6334/* error handling callback helper:
6335 build arguments, call the callback and check the arguments,
6336 put the result into newpos and return the replacement string, which
6337 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006338static PyObject *
6339unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006340 PyObject **errorHandler,
6341 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006342 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006343 Py_ssize_t startpos, Py_ssize_t endpos,
6344 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006345{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006346 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006347 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006348 PyObject *restuple;
6349 PyObject *resunicode;
6350
6351 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006352 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006353 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006354 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006355 }
6356
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006357 if (PyUnicode_READY(unicode) < 0)
6358 return NULL;
6359 len = PyUnicode_GET_LENGTH(unicode);
6360
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006361 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006362 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006363 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006365
6366 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006367 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006368 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006369 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006370 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006371 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006372 Py_DECREF(restuple);
6373 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006374 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006375 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 &resunicode, newpos)) {
6377 Py_DECREF(restuple);
6378 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006379 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006380 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6381 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6382 Py_DECREF(restuple);
6383 return NULL;
6384 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006385 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006386 *newpos = len + *newpos;
6387 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006388 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6389 Py_DECREF(restuple);
6390 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006391 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006392 Py_INCREF(resunicode);
6393 Py_DECREF(restuple);
6394 return resunicode;
6395}
6396
Alexander Belopolsky40018472011-02-26 01:02:56 +00006397static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006398unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006399 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006400 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006401{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006402 /* input state */
6403 Py_ssize_t pos=0, size;
6404 int kind;
6405 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406 /* output object */
6407 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006408 /* pointer into the output */
6409 char *str;
6410 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006411 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006412 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6413 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414 PyObject *errorHandler = NULL;
6415 PyObject *exc = NULL;
6416 /* the following variable is used for caching string comparisons
6417 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6418 int known_errorHandler = -1;
6419
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006420 if (PyUnicode_READY(unicode) < 0)
6421 return NULL;
6422 size = PyUnicode_GET_LENGTH(unicode);
6423 kind = PyUnicode_KIND(unicode);
6424 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006425 /* allocate enough for a simple encoding without
6426 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006427 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006428 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006429 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006431 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006432 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006433 ressize = size;
6434
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006435 while (pos < size) {
6436 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006437
Benjamin Peterson29060642009-01-31 22:14:21 +00006438 /* can we encode this? */
6439 if (c<limit) {
6440 /* no overflow check, because we know that the space is enough */
6441 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006442 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006443 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006444 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 Py_ssize_t requiredsize;
6446 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006447 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006448 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006449 Py_ssize_t collstart = pos;
6450 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006451 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006452 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 ++collend;
6454 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6455 if (known_errorHandler==-1) {
6456 if ((errors==NULL) || (!strcmp(errors, "strict")))
6457 known_errorHandler = 1;
6458 else if (!strcmp(errors, "replace"))
6459 known_errorHandler = 2;
6460 else if (!strcmp(errors, "ignore"))
6461 known_errorHandler = 3;
6462 else if (!strcmp(errors, "xmlcharrefreplace"))
6463 known_errorHandler = 4;
6464 else
6465 known_errorHandler = 0;
6466 }
6467 switch (known_errorHandler) {
6468 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006469 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006470 goto onError;
6471 case 2: /* replace */
6472 while (collstart++<collend)
6473 *str++ = '?'; /* fall through */
6474 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006475 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006476 break;
6477 case 4: /* xmlcharrefreplace */
6478 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006479 /* determine replacement size */
6480 for (i = collstart, repsize = 0; i < collend; ++i) {
6481 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6482 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006484 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006486 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006487 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006488 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006490 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006491 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006492 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006493 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006494 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006495 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006497 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006499 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 if (requiredsize > ressize) {
6501 if (requiredsize<2*ressize)
6502 requiredsize = 2*ressize;
6503 if (_PyBytes_Resize(&res, requiredsize))
6504 goto onError;
6505 str = PyBytes_AS_STRING(res) + respos;
6506 ressize = requiredsize;
6507 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006508 /* generate replacement */
6509 for (i = collstart; i < collend; ++i) {
6510 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006512 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006513 break;
6514 default:
6515 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006516 encoding, reason, unicode, &exc,
6517 collstart, collend, &newpos);
6518 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6519 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006520 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006521 if (PyBytes_Check(repunicode)) {
6522 /* Directly copy bytes result to output. */
6523 repsize = PyBytes_Size(repunicode);
6524 if (repsize > 1) {
6525 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006526 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006527 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6528 Py_DECREF(repunicode);
6529 goto onError;
6530 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006531 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006532 ressize += repsize-1;
6533 }
6534 memcpy(str, PyBytes_AsString(repunicode), repsize);
6535 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006536 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006537 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006538 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006539 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 /* need more space? (at least enough for what we
6541 have+the replacement+the rest of the string, so
6542 we won't have to check space for encodable characters) */
6543 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006544 repsize = PyUnicode_GET_LENGTH(repunicode);
6545 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006546 if (requiredsize > ressize) {
6547 if (requiredsize<2*ressize)
6548 requiredsize = 2*ressize;
6549 if (_PyBytes_Resize(&res, requiredsize)) {
6550 Py_DECREF(repunicode);
6551 goto onError;
6552 }
6553 str = PyBytes_AS_STRING(res) + respos;
6554 ressize = requiredsize;
6555 }
6556 /* check if there is anything unencodable in the replacement
6557 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006558 for (i = 0; repsize-->0; ++i, ++str) {
6559 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006560 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006561 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006562 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006563 Py_DECREF(repunicode);
6564 goto onError;
6565 }
6566 *str = (char)c;
6567 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006568 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006569 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006570 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006571 }
6572 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006573 /* Resize if we allocated to much */
6574 size = str - PyBytes_AS_STRING(res);
6575 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006576 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006577 if (_PyBytes_Resize(&res, size) < 0)
6578 goto onError;
6579 }
6580
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006581 Py_XDECREF(errorHandler);
6582 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006583 return res;
6584
6585 onError:
6586 Py_XDECREF(res);
6587 Py_XDECREF(errorHandler);
6588 Py_XDECREF(exc);
6589 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006590}
6591
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006592/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006593PyObject *
6594PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006595 Py_ssize_t size,
6596 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006598 PyObject *result;
6599 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6600 if (unicode == NULL)
6601 return NULL;
6602 result = unicode_encode_ucs1(unicode, errors, 256);
6603 Py_DECREF(unicode);
6604 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605}
6606
Alexander Belopolsky40018472011-02-26 01:02:56 +00006607PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006608_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609{
6610 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 PyErr_BadArgument();
6612 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006614 if (PyUnicode_READY(unicode) == -1)
6615 return NULL;
6616 /* Fast path: if it is a one-byte string, construct
6617 bytes object directly. */
6618 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6619 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6620 PyUnicode_GET_LENGTH(unicode));
6621 /* Non-Latin-1 characters present. Defer to above function to
6622 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006623 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006624}
6625
6626PyObject*
6627PyUnicode_AsLatin1String(PyObject *unicode)
6628{
6629 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630}
6631
6632/* --- 7-bit ASCII Codec -------------------------------------------------- */
6633
Alexander Belopolsky40018472011-02-26 01:02:56 +00006634PyObject *
6635PyUnicode_DecodeASCII(const char *s,
6636 Py_ssize_t size,
6637 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006639 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006640 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006641 int kind;
6642 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006643 Py_ssize_t startinpos;
6644 Py_ssize_t endinpos;
6645 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006646 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006647 int has_error;
6648 const unsigned char *p = (const unsigned char *)s;
6649 const unsigned char *end = p + size;
6650 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006651 PyObject *errorHandler = NULL;
6652 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006653
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006654 if (size == 0) {
6655 Py_INCREF(unicode_empty);
6656 return unicode_empty;
6657 }
6658
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006660 if (size == 1 && (unsigned char)s[0] < 128)
6661 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006662
Victor Stinner702c7342011-10-05 13:50:52 +02006663 has_error = 0;
6664 while (p < end && !has_error) {
6665 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6666 an explanation. */
6667 if (!((size_t) p & LONG_PTR_MASK)) {
6668 /* Help register allocation */
6669 register const unsigned char *_p = p;
6670 while (_p < aligned_end) {
6671 unsigned long value = *(unsigned long *) _p;
6672 if (value & ASCII_CHAR_MASK) {
6673 has_error = 1;
6674 break;
6675 }
6676 _p += SIZEOF_LONG;
6677 }
6678 if (_p == end)
6679 break;
6680 if (has_error)
6681 break;
6682 p = _p;
6683 }
6684 if (*p & 0x80) {
6685 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006686 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006687 }
6688 else {
6689 ++p;
6690 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006691 }
Victor Stinner702c7342011-10-05 13:50:52 +02006692 if (!has_error)
6693 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006694
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006695 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006697 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006699 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006700 kind = PyUnicode_KIND(v);
6701 data = PyUnicode_DATA(v);
6702 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006703 e = s + size;
6704 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006705 register unsigned char c = (unsigned char)*s;
6706 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006707 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006708 ++s;
6709 }
6710 else {
6711 startinpos = s-starts;
6712 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006713 if (unicode_decode_call_errorhandler(
6714 errors, &errorHandler,
6715 "ascii", "ordinal not in range(128)",
6716 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006717 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006718 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006719 kind = PyUnicode_KIND(v);
6720 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006723 if (PyUnicode_Resize(&v, outpos) < 0)
6724 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006725 Py_XDECREF(errorHandler);
6726 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006727 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006728 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006729
Benjamin Peterson29060642009-01-31 22:14:21 +00006730 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006732 Py_XDECREF(errorHandler);
6733 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 return NULL;
6735}
6736
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006737/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006738PyObject *
6739PyUnicode_EncodeASCII(const Py_UNICODE *p,
6740 Py_ssize_t size,
6741 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006743 PyObject *result;
6744 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6745 if (unicode == NULL)
6746 return NULL;
6747 result = unicode_encode_ucs1(unicode, errors, 128);
6748 Py_DECREF(unicode);
6749 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750}
6751
Alexander Belopolsky40018472011-02-26 01:02:56 +00006752PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006753_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754{
6755 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006756 PyErr_BadArgument();
6757 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006759 if (PyUnicode_READY(unicode) == -1)
6760 return NULL;
6761 /* Fast path: if it is an ASCII-only string, construct bytes object
6762 directly. Else defer to above function to raise the exception. */
6763 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6764 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6765 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006766 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006767}
6768
6769PyObject *
6770PyUnicode_AsASCIIString(PyObject *unicode)
6771{
6772 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773}
6774
Victor Stinner99b95382011-07-04 14:23:54 +02006775#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006776
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006777/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006778
/* Defined when size_t is wider than int. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Supply the flag value when the included headers do not define it. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6786
6787static char*
6788code_page_name(UINT code_page, PyObject **obj)
6789{
6790 *obj = NULL;
6791 if (code_page == CP_ACP)
6792 return "mbcs";
6793 if (code_page == CP_UTF7)
6794 return "CP_UTF7";
6795 if (code_page == CP_UTF8)
6796 return "CP_UTF8";
6797
6798 *obj = PyBytes_FromFormat("cp%u", code_page);
6799 if (*obj == NULL)
6800 return NULL;
6801 return PyBytes_AS_STRING(*obj);
6802}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006803
Alexander Belopolsky40018472011-02-26 01:02:56 +00006804static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006805is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006806{
6807 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006808 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006809
Victor Stinner3a50e702011-10-18 21:21:00 +02006810 if (!IsDBCSLeadByteEx(code_page, *curr))
6811 return 0;
6812
6813 prev = CharPrevExA(code_page, s, curr, 0);
6814 if (prev == curr)
6815 return 1;
6816 /* FIXME: This code is limited to "true" double-byte encodings,
6817 as it assumes an incomplete character consists of a single
6818 byte. */
6819 if (curr - prev == 2)
6820 return 1;
6821 if (!IsDBCSLeadByteEx(code_page, *prev))
6822 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006823 return 0;
6824}
6825
Victor Stinner3a50e702011-10-18 21:21:00 +02006826static DWORD
6827decode_code_page_flags(UINT code_page)
6828{
6829 if (code_page == CP_UTF7) {
6830 /* The CP_UTF7 decoder only supports flags=0 */
6831 return 0;
6832 }
6833 else
6834 return MB_ERR_INVALID_CHARS;
6835}
6836
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006837/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006838 * Decode a byte string from a Windows code page into unicode object in strict
6839 * mode.
6840 *
6841 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6842 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006843 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006844static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006845decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006846 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006847 const char *in,
6848 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006849{
Victor Stinner3a50e702011-10-18 21:21:00 +02006850 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006851 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006852 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006853
6854 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006855 assert(insize > 0);
6856 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6857 if (outsize <= 0)
6858 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006859
6860 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006862 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 if (*v == NULL)
6864 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006865 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006866 }
6867 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006869 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006870 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006871 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006872 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006873 }
6874
6875 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006876 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6877 if (outsize <= 0)
6878 goto error;
6879 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006880
Victor Stinner3a50e702011-10-18 21:21:00 +02006881error:
6882 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6883 return -2;
6884 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006885 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006886}
6887
Victor Stinner3a50e702011-10-18 21:21:00 +02006888/*
6889 * Decode a byte string from a code page into unicode object with an error
6890 * handler.
6891 *
6892 * Returns consumed size if succeed, or raise a WindowsError or
6893 * UnicodeDecodeError exception and returns -1 on error.
6894 */
6895static int
6896decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006897 PyObject **v,
6898 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006899 const char *errors)
6900{
6901 const char *startin = in;
6902 const char *endin = in + size;
6903 const DWORD flags = decode_code_page_flags(code_page);
6904 /* Ideally, we should get reason from FormatMessage. This is the Windows
6905 2000 English version of the message. */
6906 const char *reason = "No mapping for the Unicode character exists "
6907 "in the target code page.";
6908 /* each step cannot decode more than 1 character, but a character can be
6909 represented as a surrogate pair */
6910 wchar_t buffer[2], *startout, *out;
6911 int insize, outsize;
6912 PyObject *errorHandler = NULL;
6913 PyObject *exc = NULL;
6914 PyObject *encoding_obj = NULL;
6915 char *encoding;
6916 DWORD err;
6917 int ret = -1;
6918
6919 assert(size > 0);
6920
6921 encoding = code_page_name(code_page, &encoding_obj);
6922 if (encoding == NULL)
6923 return -1;
6924
6925 if (errors == NULL || strcmp(errors, "strict") == 0) {
6926 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6927 UnicodeDecodeError. */
6928 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6929 if (exc != NULL) {
6930 PyCodec_StrictErrors(exc);
6931 Py_CLEAR(exc);
6932 }
6933 goto error;
6934 }
6935
6936 if (*v == NULL) {
6937 /* Create unicode object */
6938 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6939 PyErr_NoMemory();
6940 goto error;
6941 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006942 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006943 if (*v == NULL)
6944 goto error;
6945 startout = PyUnicode_AS_UNICODE(*v);
6946 }
6947 else {
6948 /* Extend unicode object */
6949 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6950 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6951 PyErr_NoMemory();
6952 goto error;
6953 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006954 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006955 goto error;
6956 startout = PyUnicode_AS_UNICODE(*v) + n;
6957 }
6958
6959 /* Decode the byte string character per character */
6960 out = startout;
6961 while (in < endin)
6962 {
6963 /* Decode a character */
6964 insize = 1;
6965 do
6966 {
6967 outsize = MultiByteToWideChar(code_page, flags,
6968 in, insize,
6969 buffer, Py_ARRAY_LENGTH(buffer));
6970 if (outsize > 0)
6971 break;
6972 err = GetLastError();
6973 if (err != ERROR_NO_UNICODE_TRANSLATION
6974 && err != ERROR_INSUFFICIENT_BUFFER)
6975 {
6976 PyErr_SetFromWindowsErr(0);
6977 goto error;
6978 }
6979 insize++;
6980 }
6981 /* 4=maximum length of a UTF-8 sequence */
6982 while (insize <= 4 && (in + insize) <= endin);
6983
6984 if (outsize <= 0) {
6985 Py_ssize_t startinpos, endinpos, outpos;
6986
6987 startinpos = in - startin;
6988 endinpos = startinpos + 1;
6989 outpos = out - PyUnicode_AS_UNICODE(*v);
6990 if (unicode_decode_call_errorhandler(
6991 errors, &errorHandler,
6992 encoding, reason,
6993 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006994 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006995 {
6996 goto error;
6997 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006998 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006999 }
7000 else {
7001 in += insize;
7002 memcpy(out, buffer, outsize * sizeof(wchar_t));
7003 out += outsize;
7004 }
7005 }
7006
7007 /* write a NUL character at the end */
7008 *out = 0;
7009
7010 /* Extend unicode object */
7011 outsize = out - startout;
7012 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007013 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007014 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007015 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007016
7017error:
7018 Py_XDECREF(encoding_obj);
7019 Py_XDECREF(errorHandler);
7020 Py_XDECREF(exc);
7021 return ret;
7022}
7023
Victor Stinner3a50e702011-10-18 21:21:00 +02007024static PyObject *
7025decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007026 const char *s, Py_ssize_t size,
7027 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007028{
Victor Stinner76a31a62011-11-04 00:05:13 +01007029 PyObject *v = NULL;
7030 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007031
Victor Stinner3a50e702011-10-18 21:21:00 +02007032 if (code_page < 0) {
7033 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7034 return NULL;
7035 }
7036
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007037 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007038 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007039
Victor Stinner76a31a62011-11-04 00:05:13 +01007040 do
7041 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007042#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007043 if (size > INT_MAX) {
7044 chunk_size = INT_MAX;
7045 final = 0;
7046 done = 0;
7047 }
7048 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007049#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007050 {
7051 chunk_size = (int)size;
7052 final = (consumed == NULL);
7053 done = 1;
7054 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007055
Victor Stinner76a31a62011-11-04 00:05:13 +01007056 /* Skip trailing lead-byte unless 'final' is set */
7057 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7058 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007059
Victor Stinner76a31a62011-11-04 00:05:13 +01007060 if (chunk_size == 0 && done) {
7061 if (v != NULL)
7062 break;
7063 Py_INCREF(unicode_empty);
7064 return unicode_empty;
7065 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007066
Victor Stinner76a31a62011-11-04 00:05:13 +01007067
7068 converted = decode_code_page_strict(code_page, &v,
7069 s, chunk_size);
7070 if (converted == -2)
7071 converted = decode_code_page_errors(code_page, &v,
7072 s, chunk_size,
7073 errors);
7074 assert(converted != 0);
7075
7076 if (converted < 0) {
7077 Py_XDECREF(v);
7078 return NULL;
7079 }
7080
7081 if (consumed)
7082 *consumed += converted;
7083
7084 s += converted;
7085 size -= converted;
7086 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007087
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007088 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007089}
7090
Alexander Belopolsky40018472011-02-26 01:02:56 +00007091PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007092PyUnicode_DecodeCodePageStateful(int code_page,
7093 const char *s,
7094 Py_ssize_t size,
7095 const char *errors,
7096 Py_ssize_t *consumed)
7097{
7098 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7099}
7100
7101PyObject *
7102PyUnicode_DecodeMBCSStateful(const char *s,
7103 Py_ssize_t size,
7104 const char *errors,
7105 Py_ssize_t *consumed)
7106{
7107 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7108}
7109
7110PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007111PyUnicode_DecodeMBCS(const char *s,
7112 Py_ssize_t size,
7113 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007114{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007115 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7116}
7117
Victor Stinner3a50e702011-10-18 21:21:00 +02007118static DWORD
7119encode_code_page_flags(UINT code_page, const char *errors)
7120{
7121 if (code_page == CP_UTF8) {
7122 if (winver.dwMajorVersion >= 6)
7123 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7124 and later */
7125 return WC_ERR_INVALID_CHARS;
7126 else
7127 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7128 return 0;
7129 }
7130 else if (code_page == CP_UTF7) {
7131 /* CP_UTF7 only supports flags=0 */
7132 return 0;
7133 }
7134 else {
7135 if (errors != NULL && strcmp(errors, "replace") == 0)
7136 return 0;
7137 else
7138 return WC_NO_BEST_FIT_CHARS;
7139 }
7140}
7141
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007142/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007143 * Encode a Unicode string to a Windows code page into a byte string in strict
7144 * mode.
7145 *
7146 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7147 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007148 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007149static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007150encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007151 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007152 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007153{
Victor Stinner554f3f02010-06-16 23:33:54 +00007154 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007155 BOOL *pusedDefaultChar = &usedDefaultChar;
7156 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007157 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007158 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007159 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007160 const DWORD flags = encode_code_page_flags(code_page, NULL);
7161 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007162 /* Create a substring so that we can get the UTF-16 representation
7163 of just the slice under consideration. */
7164 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007165
Martin v. Löwis3d325192011-11-04 18:23:06 +01007166 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007167
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007169 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007170 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007171 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007172
Victor Stinner2fc507f2011-11-04 20:06:39 +01007173 substring = PyUnicode_Substring(unicode, offset, offset+len);
7174 if (substring == NULL)
7175 return -1;
7176 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7177 if (p == NULL) {
7178 Py_DECREF(substring);
7179 return -1;
7180 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007181
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007182 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007183 outsize = WideCharToMultiByte(code_page, flags,
7184 p, size,
7185 NULL, 0,
7186 NULL, pusedDefaultChar);
7187 if (outsize <= 0)
7188 goto error;
7189 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007190 if (pusedDefaultChar && *pusedDefaultChar) {
7191 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007193 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007194
Victor Stinner3a50e702011-10-18 21:21:00 +02007195 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007196 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007197 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007198 if (*outbytes == NULL) {
7199 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007200 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007201 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007202 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007203 }
7204 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007205 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007206 const Py_ssize_t n = PyBytes_Size(*outbytes);
7207 if (outsize > PY_SSIZE_T_MAX - n) {
7208 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007209 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007210 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007211 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007212 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7213 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007214 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007215 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007217 }
7218
7219 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007220 outsize = WideCharToMultiByte(code_page, flags,
7221 p, size,
7222 out, outsize,
7223 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007224 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007225 if (outsize <= 0)
7226 goto error;
7227 if (pusedDefaultChar && *pusedDefaultChar)
7228 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007229 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007230
Victor Stinner3a50e702011-10-18 21:21:00 +02007231error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007232 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007233 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7234 return -2;
7235 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007236 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007237}
7238
Victor Stinner3a50e702011-10-18 21:21:00 +02007239/*
7240 * Encode a Unicode string to a Windows code page into a byte string using a
7241 * error handler.
7242 *
7243 * Returns consumed characters if succeed, or raise a WindowsError and returns
7244 * -1 on other error.
7245 */
7246static int
7247encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007248 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007249 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007250{
Victor Stinner3a50e702011-10-18 21:21:00 +02007251 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007252 Py_ssize_t pos = unicode_offset;
7253 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007254 /* Ideally, we should get reason from FormatMessage. This is the Windows
7255 2000 English version of the message. */
7256 const char *reason = "invalid character";
7257 /* 4=maximum length of a UTF-8 sequence */
7258 char buffer[4];
7259 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7260 Py_ssize_t outsize;
7261 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007262 PyObject *errorHandler = NULL;
7263 PyObject *exc = NULL;
7264 PyObject *encoding_obj = NULL;
7265 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007266 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007267 PyObject *rep;
7268 int ret = -1;
7269
7270 assert(insize > 0);
7271
7272 encoding = code_page_name(code_page, &encoding_obj);
7273 if (encoding == NULL)
7274 return -1;
7275
7276 if (errors == NULL || strcmp(errors, "strict") == 0) {
7277 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7278 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007279 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007280 if (exc != NULL) {
7281 PyCodec_StrictErrors(exc);
7282 Py_DECREF(exc);
7283 }
7284 Py_XDECREF(encoding_obj);
7285 return -1;
7286 }
7287
7288 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7289 pusedDefaultChar = &usedDefaultChar;
7290 else
7291 pusedDefaultChar = NULL;
7292
7293 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7294 PyErr_NoMemory();
7295 goto error;
7296 }
7297 outsize = insize * Py_ARRAY_LENGTH(buffer);
7298
7299 if (*outbytes == NULL) {
7300 /* Create string object */
7301 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7302 if (*outbytes == NULL)
7303 goto error;
7304 out = PyBytes_AS_STRING(*outbytes);
7305 }
7306 else {
7307 /* Extend string object */
7308 Py_ssize_t n = PyBytes_Size(*outbytes);
7309 if (n > PY_SSIZE_T_MAX - outsize) {
7310 PyErr_NoMemory();
7311 goto error;
7312 }
7313 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7314 goto error;
7315 out = PyBytes_AS_STRING(*outbytes) + n;
7316 }
7317
7318 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007319 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007320 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007321 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7322 wchar_t chars[2];
7323 int charsize;
7324 if (ch < 0x10000) {
7325 chars[0] = (wchar_t)ch;
7326 charsize = 1;
7327 }
7328 else {
7329 ch -= 0x10000;
7330 chars[0] = 0xd800 + (ch >> 10);
7331 chars[1] = 0xdc00 + (ch & 0x3ff);
7332 charsize = 2;
7333 }
7334
Victor Stinner3a50e702011-10-18 21:21:00 +02007335 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007336 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007337 buffer, Py_ARRAY_LENGTH(buffer),
7338 NULL, pusedDefaultChar);
7339 if (outsize > 0) {
7340 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7341 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007342 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007343 memcpy(out, buffer, outsize);
7344 out += outsize;
7345 continue;
7346 }
7347 }
7348 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7349 PyErr_SetFromWindowsErr(0);
7350 goto error;
7351 }
7352
Victor Stinner3a50e702011-10-18 21:21:00 +02007353 rep = unicode_encode_call_errorhandler(
7354 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007355 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007356 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007357 if (rep == NULL)
7358 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007359 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007360
7361 if (PyBytes_Check(rep)) {
7362 outsize = PyBytes_GET_SIZE(rep);
7363 if (outsize != 1) {
7364 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7365 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7366 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7367 Py_DECREF(rep);
7368 goto error;
7369 }
7370 out = PyBytes_AS_STRING(*outbytes) + offset;
7371 }
7372 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7373 out += outsize;
7374 }
7375 else {
7376 Py_ssize_t i;
7377 enum PyUnicode_Kind kind;
7378 void *data;
7379
7380 if (PyUnicode_READY(rep) < 0) {
7381 Py_DECREF(rep);
7382 goto error;
7383 }
7384
7385 outsize = PyUnicode_GET_LENGTH(rep);
7386 if (outsize != 1) {
7387 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7388 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7389 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7390 Py_DECREF(rep);
7391 goto error;
7392 }
7393 out = PyBytes_AS_STRING(*outbytes) + offset;
7394 }
7395 kind = PyUnicode_KIND(rep);
7396 data = PyUnicode_DATA(rep);
7397 for (i=0; i < outsize; i++) {
7398 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7399 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007400 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007401 encoding, unicode,
7402 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007403 "unable to encode error handler result to ASCII");
7404 Py_DECREF(rep);
7405 goto error;
7406 }
7407 *out = (unsigned char)ch;
7408 out++;
7409 }
7410 }
7411 Py_DECREF(rep);
7412 }
7413 /* write a NUL byte */
7414 *out = 0;
7415 outsize = out - PyBytes_AS_STRING(*outbytes);
7416 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7417 if (_PyBytes_Resize(outbytes, outsize) < 0)
7418 goto error;
7419 ret = 0;
7420
7421error:
7422 Py_XDECREF(encoding_obj);
7423 Py_XDECREF(errorHandler);
7424 Py_XDECREF(exc);
7425 return ret;
7426}
7427
Victor Stinner3a50e702011-10-18 21:21:00 +02007428static PyObject *
7429encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007430 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007431 const char *errors)
7432{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007433 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007434 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007435 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007436 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007437
Victor Stinner2fc507f2011-11-04 20:06:39 +01007438 if (PyUnicode_READY(unicode) < 0)
7439 return NULL;
7440 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007441
Victor Stinner3a50e702011-10-18 21:21:00 +02007442 if (code_page < 0) {
7443 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7444 return NULL;
7445 }
7446
Martin v. Löwis3d325192011-11-04 18:23:06 +01007447 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007448 return PyBytes_FromStringAndSize(NULL, 0);
7449
Victor Stinner7581cef2011-11-03 22:32:33 +01007450 offset = 0;
7451 do
7452 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007453#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007454 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007455 chunks. */
7456 if (len > INT_MAX/2) {
7457 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007458 done = 0;
7459 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007460 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007461#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007462 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007463 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007464 done = 1;
7465 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007466
Victor Stinner76a31a62011-11-04 00:05:13 +01007467 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007468 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007469 errors);
7470 if (ret == -2)
7471 ret = encode_code_page_errors(code_page, &outbytes,
7472 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007473 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007474 if (ret < 0) {
7475 Py_XDECREF(outbytes);
7476 return NULL;
7477 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007478
Victor Stinner7581cef2011-11-03 22:32:33 +01007479 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007480 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007481 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007482
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 return outbytes;
7484}
7485
7486PyObject *
7487PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7488 Py_ssize_t size,
7489 const char *errors)
7490{
Victor Stinner7581cef2011-11-03 22:32:33 +01007491 PyObject *unicode, *res;
7492 unicode = PyUnicode_FromUnicode(p, size);
7493 if (unicode == NULL)
7494 return NULL;
7495 res = encode_code_page(CP_ACP, unicode, errors);
7496 Py_DECREF(unicode);
7497 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007498}
7499
7500PyObject *
7501PyUnicode_EncodeCodePage(int code_page,
7502 PyObject *unicode,
7503 const char *errors)
7504{
Victor Stinner7581cef2011-11-03 22:32:33 +01007505 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007506}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007507
Alexander Belopolsky40018472011-02-26 01:02:56 +00007508PyObject *
7509PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007510{
7511 if (!PyUnicode_Check(unicode)) {
7512 PyErr_BadArgument();
7513 return NULL;
7514 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007515 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007516}
7517
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007518#undef NEED_RETRY
7519
Victor Stinner99b95382011-07-04 14:23:54 +02007520#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007521
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522/* --- Character Mapping Codec -------------------------------------------- */
7523
Alexander Belopolsky40018472011-02-26 01:02:56 +00007524PyObject *
7525PyUnicode_DecodeCharmap(const char *s,
7526 Py_ssize_t size,
7527 PyObject *mapping,
7528 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007530 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007531 Py_ssize_t startinpos;
7532 Py_ssize_t endinpos;
7533 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007534 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007535 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007536 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007537 PyObject *errorHandler = NULL;
7538 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007539
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540 /* Default to Latin-1 */
7541 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007542 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007544 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007546 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007548 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007549 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007550 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007551 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007552 Py_ssize_t maplen;
7553 enum PyUnicode_Kind kind;
7554 void *data;
7555 Py_UCS4 x;
7556
7557 if (PyUnicode_READY(mapping) < 0)
7558 return NULL;
7559
7560 maplen = PyUnicode_GET_LENGTH(mapping);
7561 data = PyUnicode_DATA(mapping);
7562 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007563 while (s < e) {
7564 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565
Benjamin Peterson29060642009-01-31 22:14:21 +00007566 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007567 x = PyUnicode_READ(kind, data, ch);
7568 else
7569 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007571 if (x == 0xfffe)
7572 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007573 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007574 startinpos = s-starts;
7575 endinpos = startinpos+1;
7576 if (unicode_decode_call_errorhandler(
7577 errors, &errorHandler,
7578 "charmap", "character maps to <undefined>",
7579 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007580 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007581 goto onError;
7582 }
7583 continue;
7584 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007585
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007586 if (unicode_putchar(&v, &outpos, x) < 0)
7587 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007589 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007590 }
7591 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007592 while (s < e) {
7593 unsigned char ch = *s;
7594 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007595
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7597 w = PyLong_FromLong((long)ch);
7598 if (w == NULL)
7599 goto onError;
7600 x = PyObject_GetItem(mapping, w);
7601 Py_DECREF(w);
7602 if (x == NULL) {
7603 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7604 /* No mapping found means: mapping is undefined. */
7605 PyErr_Clear();
7606 x = Py_None;
7607 Py_INCREF(x);
7608 } else
7609 goto onError;
7610 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007611
Benjamin Peterson29060642009-01-31 22:14:21 +00007612 /* Apply mapping */
7613 if (PyLong_Check(x)) {
7614 long value = PyLong_AS_LONG(x);
7615 if (value < 0 || value > 65535) {
7616 PyErr_SetString(PyExc_TypeError,
7617 "character mapping must be in range(65536)");
7618 Py_DECREF(x);
7619 goto onError;
7620 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007621 if (unicode_putchar(&v, &outpos, value) < 0)
7622 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007623 }
7624 else if (x == Py_None) {
7625 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007626 startinpos = s-starts;
7627 endinpos = startinpos+1;
7628 if (unicode_decode_call_errorhandler(
7629 errors, &errorHandler,
7630 "charmap", "character maps to <undefined>",
7631 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007632 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007633 Py_DECREF(x);
7634 goto onError;
7635 }
7636 Py_DECREF(x);
7637 continue;
7638 }
7639 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007640 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007641
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007642 if (PyUnicode_READY(x) < 0)
7643 goto onError;
7644 targetsize = PyUnicode_GET_LENGTH(x);
7645
7646 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007647 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007648 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007649 PyUnicode_READ_CHAR(x, 0)) < 0)
7650 goto onError;
7651 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007652 else if (targetsize > 1) {
7653 /* 1-n mapping */
7654 if (targetsize > extrachars) {
7655 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007656 Py_ssize_t needed = (targetsize - extrachars) + \
7657 (targetsize << 2);
7658 extrachars += needed;
7659 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007660 if (PyUnicode_Resize(&v,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007661 PyUnicode_GET_LENGTH(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007662 Py_DECREF(x);
7663 goto onError;
7664 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007665 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007666 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7667 goto onError;
7668 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7669 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007670 extrachars -= targetsize;
7671 }
7672 /* 1-0 mapping: skip the character */
7673 }
7674 else {
7675 /* wrong return value */
7676 PyErr_SetString(PyExc_TypeError,
7677 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007678 Py_DECREF(x);
7679 goto onError;
7680 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007681 Py_DECREF(x);
7682 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007683 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007685 if (PyUnicode_Resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007686 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007687 Py_XDECREF(errorHandler);
7688 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007689 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007690
Benjamin Peterson29060642009-01-31 22:14:21 +00007691 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007692 Py_XDECREF(errorHandler);
7693 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694 Py_XDECREF(v);
7695 return NULL;
7696}
7697
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007698/* Charmap encoding: the lookup table */
7699
Alexander Belopolsky40018472011-02-26 01:02:56 +00007700struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007701 PyObject_HEAD
7702 unsigned char level1[32];
7703 int count2, count3;
7704 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007705};
7706
7707static PyObject*
7708encoding_map_size(PyObject *obj, PyObject* args)
7709{
7710 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007711 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007712 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007713}
7714
7715static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007716 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007717 PyDoc_STR("Return the size (in bytes) of this object") },
7718 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007719};
7720
7721static void
7722encoding_map_dealloc(PyObject* o)
7723{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007724 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007725}
7726
7727static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007728 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007729 "EncodingMap", /*tp_name*/
7730 sizeof(struct encoding_map), /*tp_basicsize*/
7731 0, /*tp_itemsize*/
7732 /* methods */
7733 encoding_map_dealloc, /*tp_dealloc*/
7734 0, /*tp_print*/
7735 0, /*tp_getattr*/
7736 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007737 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007738 0, /*tp_repr*/
7739 0, /*tp_as_number*/
7740 0, /*tp_as_sequence*/
7741 0, /*tp_as_mapping*/
7742 0, /*tp_hash*/
7743 0, /*tp_call*/
7744 0, /*tp_str*/
7745 0, /*tp_getattro*/
7746 0, /*tp_setattro*/
7747 0, /*tp_as_buffer*/
7748 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7749 0, /*tp_doc*/
7750 0, /*tp_traverse*/
7751 0, /*tp_clear*/
7752 0, /*tp_richcompare*/
7753 0, /*tp_weaklistoffset*/
7754 0, /*tp_iter*/
7755 0, /*tp_iternext*/
7756 encoding_map_methods, /*tp_methods*/
7757 0, /*tp_members*/
7758 0, /*tp_getset*/
7759 0, /*tp_base*/
7760 0, /*tp_dict*/
7761 0, /*tp_descr_get*/
7762 0, /*tp_descr_set*/
7763 0, /*tp_dictoffset*/
7764 0, /*tp_init*/
7765 0, /*tp_alloc*/
7766 0, /*tp_new*/
7767 0, /*tp_free*/
7768 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007769};
7770
7771PyObject*
7772PyUnicode_BuildEncodingMap(PyObject* string)
7773{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007774 PyObject *result;
7775 struct encoding_map *mresult;
7776 int i;
7777 int need_dict = 0;
7778 unsigned char level1[32];
7779 unsigned char level2[512];
7780 unsigned char *mlevel1, *mlevel2, *mlevel3;
7781 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007782 int kind;
7783 void *data;
7784 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007786 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007787 PyErr_BadArgument();
7788 return NULL;
7789 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007790 kind = PyUnicode_KIND(string);
7791 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007792 memset(level1, 0xFF, sizeof level1);
7793 memset(level2, 0xFF, sizeof level2);
7794
7795 /* If there isn't a one-to-one mapping of NULL to \0,
7796 or if there are non-BMP characters, we need to use
7797 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007798 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007799 need_dict = 1;
7800 for (i = 1; i < 256; i++) {
7801 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007802 ch = PyUnicode_READ(kind, data, i);
7803 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007804 need_dict = 1;
7805 break;
7806 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007807 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007808 /* unmapped character */
7809 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007810 l1 = ch >> 11;
7811 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007812 if (level1[l1] == 0xFF)
7813 level1[l1] = count2++;
7814 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007815 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007816 }
7817
7818 if (count2 >= 0xFF || count3 >= 0xFF)
7819 need_dict = 1;
7820
7821 if (need_dict) {
7822 PyObject *result = PyDict_New();
7823 PyObject *key, *value;
7824 if (!result)
7825 return NULL;
7826 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007827 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007828 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007829 if (!key || !value)
7830 goto failed1;
7831 if (PyDict_SetItem(result, key, value) == -1)
7832 goto failed1;
7833 Py_DECREF(key);
7834 Py_DECREF(value);
7835 }
7836 return result;
7837 failed1:
7838 Py_XDECREF(key);
7839 Py_XDECREF(value);
7840 Py_DECREF(result);
7841 return NULL;
7842 }
7843
7844 /* Create a three-level trie */
7845 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7846 16*count2 + 128*count3 - 1);
7847 if (!result)
7848 return PyErr_NoMemory();
7849 PyObject_Init(result, &EncodingMapType);
7850 mresult = (struct encoding_map*)result;
7851 mresult->count2 = count2;
7852 mresult->count3 = count3;
7853 mlevel1 = mresult->level1;
7854 mlevel2 = mresult->level23;
7855 mlevel3 = mresult->level23 + 16*count2;
7856 memcpy(mlevel1, level1, 32);
7857 memset(mlevel2, 0xFF, 16*count2);
7858 memset(mlevel3, 0, 128*count3);
7859 count3 = 0;
7860 for (i = 1; i < 256; i++) {
7861 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007862 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007863 /* unmapped character */
7864 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007865 o1 = PyUnicode_READ(kind, data, i)>>11;
7866 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007867 i2 = 16*mlevel1[o1] + o2;
7868 if (mlevel2[i2] == 0xFF)
7869 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007870 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007871 i3 = 128*mlevel2[i2] + o3;
7872 mlevel3[i3] = i;
7873 }
7874 return result;
7875}
7876
7877static int
Victor Stinner22168992011-11-20 17:09:18 +01007878encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007879{
7880 struct encoding_map *map = (struct encoding_map*)mapping;
7881 int l1 = c>>11;
7882 int l2 = (c>>7) & 0xF;
7883 int l3 = c & 0x7F;
7884 int i;
7885
Victor Stinner22168992011-11-20 17:09:18 +01007886 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007887 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007888 if (c == 0)
7889 return 0;
7890 /* level 1*/
7891 i = map->level1[l1];
7892 if (i == 0xFF) {
7893 return -1;
7894 }
7895 /* level 2*/
7896 i = map->level23[16*i+l2];
7897 if (i == 0xFF) {
7898 return -1;
7899 }
7900 /* level 3 */
7901 i = map->level23[16*map->count2 + 128*i + l3];
7902 if (i == 0) {
7903 return -1;
7904 }
7905 return i;
7906}
7907
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007908/* Lookup the character ch in the mapping. If the character
7909 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007910 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007911static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007912charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007913{
Christian Heimes217cfd12007-12-02 14:31:20 +00007914 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007915 PyObject *x;
7916
7917 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007918 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007919 x = PyObject_GetItem(mapping, w);
7920 Py_DECREF(w);
7921 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007922 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7923 /* No mapping found means: mapping is undefined. */
7924 PyErr_Clear();
7925 x = Py_None;
7926 Py_INCREF(x);
7927 return x;
7928 } else
7929 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007931 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007932 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007933 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007934 long value = PyLong_AS_LONG(x);
7935 if (value < 0 || value > 255) {
7936 PyErr_SetString(PyExc_TypeError,
7937 "character mapping must be in range(256)");
7938 Py_DECREF(x);
7939 return NULL;
7940 }
7941 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007942 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007943 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007944 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007945 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007946 /* wrong return value */
7947 PyErr_Format(PyExc_TypeError,
7948 "character mapping must return integer, bytes or None, not %.400s",
7949 x->ob_type->tp_name);
7950 Py_DECREF(x);
7951 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952 }
7953}
7954
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007955static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007956charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007957{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007958 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7959 /* exponentially overallocate to minimize reallocations */
7960 if (requiredsize < 2*outsize)
7961 requiredsize = 2*outsize;
7962 if (_PyBytes_Resize(outobj, requiredsize))
7963 return -1;
7964 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007965}
7966
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007970/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007971 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007972 space is available. Return a new reference to the object that
7973 was put in the output buffer, or Py_None, if the mapping was undefined
7974 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007975 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007976static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007977charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007978 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007979{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007980 PyObject *rep;
7981 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007982 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007983
Christian Heimes90aa7642007-12-19 02:45:37 +00007984 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007985 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007987 if (res == -1)
7988 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 if (outsize<requiredsize)
7990 if (charmapencode_resize(outobj, outpos, requiredsize))
7991 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007992 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 outstart[(*outpos)++] = (char)res;
7994 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007995 }
7996
7997 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007998 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007999 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008000 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008001 Py_DECREF(rep);
8002 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008003 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008004 if (PyLong_Check(rep)) {
8005 Py_ssize_t requiredsize = *outpos+1;
8006 if (outsize<requiredsize)
8007 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8008 Py_DECREF(rep);
8009 return enc_EXCEPTION;
8010 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008011 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008012 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008013 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 else {
8015 const char *repchars = PyBytes_AS_STRING(rep);
8016 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8017 Py_ssize_t requiredsize = *outpos+repsize;
8018 if (outsize<requiredsize)
8019 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8020 Py_DECREF(rep);
8021 return enc_EXCEPTION;
8022 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008023 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 memcpy(outstart + *outpos, repchars, repsize);
8025 *outpos += repsize;
8026 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008027 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008028 Py_DECREF(rep);
8029 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008030}
8031
8032/* handle an error in PyUnicode_EncodeCharmap
8033 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008034static int
8035charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008036 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008037 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008038 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008039 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008040{
8041 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008042 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008043 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008044 enum PyUnicode_Kind kind;
8045 void *data;
8046 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008047 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008048 Py_ssize_t collstartpos = *inpos;
8049 Py_ssize_t collendpos = *inpos+1;
8050 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008051 char *encoding = "charmap";
8052 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008053 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008054 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008055 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008056
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008057 if (PyUnicode_READY(unicode) < 0)
8058 return -1;
8059 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008060 /* find all unencodable characters */
8061 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008062 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008063 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008064 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008065 val = encoding_map_lookup(ch, mapping);
8066 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 break;
8068 ++collendpos;
8069 continue;
8070 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008071
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008072 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8073 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008074 if (rep==NULL)
8075 return -1;
8076 else if (rep!=Py_None) {
8077 Py_DECREF(rep);
8078 break;
8079 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008080 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008082 }
8083 /* cache callback name lookup
8084 * (if not done yet, i.e. it's the first error) */
8085 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 if ((errors==NULL) || (!strcmp(errors, "strict")))
8087 *known_errorHandler = 1;
8088 else if (!strcmp(errors, "replace"))
8089 *known_errorHandler = 2;
8090 else if (!strcmp(errors, "ignore"))
8091 *known_errorHandler = 3;
8092 else if (!strcmp(errors, "xmlcharrefreplace"))
8093 *known_errorHandler = 4;
8094 else
8095 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008096 }
8097 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008098 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008099 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008100 return -1;
8101 case 2: /* replace */
8102 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008103 x = charmapencode_output('?', mapping, res, respos);
8104 if (x==enc_EXCEPTION) {
8105 return -1;
8106 }
8107 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008108 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008109 return -1;
8110 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008111 }
8112 /* fall through */
8113 case 3: /* ignore */
8114 *inpos = collendpos;
8115 break;
8116 case 4: /* xmlcharrefreplace */
8117 /* generate replacement (temporarily (mis)uses p) */
8118 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 char buffer[2+29+1+1];
8120 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008121 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008122 for (cp = buffer; *cp; ++cp) {
8123 x = charmapencode_output(*cp, mapping, res, respos);
8124 if (x==enc_EXCEPTION)
8125 return -1;
8126 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008127 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008128 return -1;
8129 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008130 }
8131 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008132 *inpos = collendpos;
8133 break;
8134 default:
8135 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008136 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008138 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008139 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008140 if (PyBytes_Check(repunicode)) {
8141 /* Directly copy bytes result to output. */
8142 Py_ssize_t outsize = PyBytes_Size(*res);
8143 Py_ssize_t requiredsize;
8144 repsize = PyBytes_Size(repunicode);
8145 requiredsize = *respos + repsize;
8146 if (requiredsize > outsize)
8147 /* Make room for all additional bytes. */
8148 if (charmapencode_resize(res, respos, requiredsize)) {
8149 Py_DECREF(repunicode);
8150 return -1;
8151 }
8152 memcpy(PyBytes_AsString(*res) + *respos,
8153 PyBytes_AsString(repunicode), repsize);
8154 *respos += repsize;
8155 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008156 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008157 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008158 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008159 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008160 if (PyUnicode_READY(repunicode) < 0) {
8161 Py_DECREF(repunicode);
8162 return -1;
8163 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008164 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008165 data = PyUnicode_DATA(repunicode);
8166 kind = PyUnicode_KIND(repunicode);
8167 for (index = 0; index < repsize; index++) {
8168 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8169 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008170 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008171 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008172 return -1;
8173 }
8174 else if (x==enc_FAILED) {
8175 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008176 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 return -1;
8178 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008179 }
8180 *inpos = newpos;
8181 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008182 }
8183 return 0;
8184}
8185
Alexander Belopolsky40018472011-02-26 01:02:56 +00008186PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008187_PyUnicode_EncodeCharmap(PyObject *unicode,
8188 PyObject *mapping,
8189 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008190{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008191 /* output object */
8192 PyObject *res = NULL;
8193 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008194 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008195 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008196 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008197 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008198 PyObject *errorHandler = NULL;
8199 PyObject *exc = NULL;
8200 /* the following variable is used for caching string comparisons
8201 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8202 * 3=ignore, 4=xmlcharrefreplace */
8203 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008205 if (PyUnicode_READY(unicode) < 0)
8206 return NULL;
8207 size = PyUnicode_GET_LENGTH(unicode);
8208
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209 /* Default to Latin-1 */
8210 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008211 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008213 /* allocate enough for a simple encoding without
8214 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008215 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008216 if (res == NULL)
8217 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008218 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008220
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008221 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008222 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008223 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008224 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008225 if (x==enc_EXCEPTION) /* error */
8226 goto onError;
8227 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008228 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008229 &exc,
8230 &known_errorHandler, &errorHandler, errors,
8231 &res, &respos)) {
8232 goto onError;
8233 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008234 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 else
8236 /* done with this character => adjust input position */
8237 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008241 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008242 if (_PyBytes_Resize(&res, respos) < 0)
8243 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008244
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008245 Py_XDECREF(exc);
8246 Py_XDECREF(errorHandler);
8247 return res;
8248
Benjamin Peterson29060642009-01-31 22:14:21 +00008249 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008250 Py_XDECREF(res);
8251 Py_XDECREF(exc);
8252 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253 return NULL;
8254}
8255
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008256/* Deprecated */
8257PyObject *
8258PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8259 Py_ssize_t size,
8260 PyObject *mapping,
8261 const char *errors)
8262{
8263 PyObject *result;
8264 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8265 if (unicode == NULL)
8266 return NULL;
8267 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8268 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008269 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008270}
8271
Alexander Belopolsky40018472011-02-26 01:02:56 +00008272PyObject *
8273PyUnicode_AsCharmapString(PyObject *unicode,
8274 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275{
8276 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 PyErr_BadArgument();
8278 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008280 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281}
8282
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008283/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008284static void
8285make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008286 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008287 Py_ssize_t startpos, Py_ssize_t endpos,
8288 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008290 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008291 *exceptionObject = _PyUnicodeTranslateError_Create(
8292 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 }
8294 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008295 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8296 goto onError;
8297 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8298 goto onError;
8299 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8300 goto onError;
8301 return;
8302 onError:
8303 Py_DECREF(*exceptionObject);
8304 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305 }
8306}
8307
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008308/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008309static void
8310raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008311 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008312 Py_ssize_t startpos, Py_ssize_t endpos,
8313 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008314{
8315 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008316 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008317 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008319}
8320
8321/* error handling callback helper:
8322 build arguments, call the callback and check the arguments,
8323 put the result into newpos and return the replacement string, which
8324 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008325static PyObject *
8326unicode_translate_call_errorhandler(const char *errors,
8327 PyObject **errorHandler,
8328 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008329 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008330 Py_ssize_t startpos, Py_ssize_t endpos,
8331 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008332{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008333 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008334
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008335 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008336 PyObject *restuple;
8337 PyObject *resunicode;
8338
8339 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008341 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008342 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008343 }
8344
8345 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008346 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008347 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008349
8350 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008354 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008355 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 Py_DECREF(restuple);
8357 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008358 }
8359 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 &resunicode, &i_newpos)) {
8361 Py_DECREF(restuple);
8362 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008363 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008364 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008365 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008366 else
8367 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008368 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8370 Py_DECREF(restuple);
8371 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008372 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008373 Py_INCREF(resunicode);
8374 Py_DECREF(restuple);
8375 return resunicode;
8376}
8377
8378/* Lookup the character ch in the mapping and put the result in result,
8379 which must be decrefed by the caller.
8380 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008381static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008382charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008383{
Christian Heimes217cfd12007-12-02 14:31:20 +00008384 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385 PyObject *x;
8386
8387 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008389 x = PyObject_GetItem(mapping, w);
8390 Py_DECREF(w);
8391 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8393 /* No mapping found means: use 1:1 mapping. */
8394 PyErr_Clear();
8395 *result = NULL;
8396 return 0;
8397 } else
8398 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008399 }
8400 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 *result = x;
8402 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008404 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 long value = PyLong_AS_LONG(x);
8406 long max = PyUnicode_GetMax();
8407 if (value < 0 || value > max) {
8408 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008409 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 Py_DECREF(x);
8411 return -1;
8412 }
8413 *result = x;
8414 return 0;
8415 }
8416 else if (PyUnicode_Check(x)) {
8417 *result = x;
8418 return 0;
8419 }
8420 else {
8421 /* wrong return value */
8422 PyErr_SetString(PyExc_TypeError,
8423 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008424 Py_DECREF(x);
8425 return -1;
8426 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008427}
8428/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 if not reallocate and adjust various state variables.
8430 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008431static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008432charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008433 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008435 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008436 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 /* exponentially overallocate to minimize reallocations */
8438 if (requiredsize < 2 * oldsize)
8439 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008440 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8441 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008443 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008444 }
8445 return 0;
8446}
8447/* lookup the character, put the result in the output string and adjust
8448 various state variables. Return a new reference to the object that
8449 was put in the output buffer in *result, or Py_None, if the mapping was
8450 undefined (in which case no character was written).
8451 The called must decref result.
8452 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008453static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8455 PyObject *mapping, Py_UCS4 **output,
8456 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008457 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008458{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008459 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8460 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008462 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008464 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008465 }
8466 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008468 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008470 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008471 }
8472 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008473 Py_ssize_t repsize;
8474 if (PyUnicode_READY(*res) == -1)
8475 return -1;
8476 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 if (repsize==1) {
8478 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008479 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 }
8481 else if (repsize!=0) {
8482 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008483 Py_ssize_t requiredsize = *opos +
8484 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008486 Py_ssize_t i;
8487 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008489 for(i = 0; i < repsize; i++)
8490 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008492 }
8493 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008495 return 0;
8496}
8497
Alexander Belopolsky40018472011-02-26 01:02:56 +00008498PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008499_PyUnicode_TranslateCharmap(PyObject *input,
8500 PyObject *mapping,
8501 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008502{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008503 /* input object */
8504 char *idata;
8505 Py_ssize_t size, i;
8506 int kind;
8507 /* output buffer */
8508 Py_UCS4 *output = NULL;
8509 Py_ssize_t osize;
8510 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008511 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008512 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008513 char *reason = "character maps to <undefined>";
8514 PyObject *errorHandler = NULL;
8515 PyObject *exc = NULL;
8516 /* the following variable is used for caching string comparisons
8517 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8518 * 3=ignore, 4=xmlcharrefreplace */
8519 int known_errorHandler = -1;
8520
Guido van Rossumd57fd912000-03-10 22:53:23 +00008521 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 PyErr_BadArgument();
8523 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008525
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 if (PyUnicode_READY(input) == -1)
8527 return NULL;
8528 idata = (char*)PyUnicode_DATA(input);
8529 kind = PyUnicode_KIND(input);
8530 size = PyUnicode_GET_LENGTH(input);
8531 i = 0;
8532
8533 if (size == 0) {
8534 Py_INCREF(input);
8535 return input;
8536 }
8537
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008538 /* allocate enough for a simple 1:1 translation without
8539 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540 osize = size;
8541 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8542 opos = 0;
8543 if (output == NULL) {
8544 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008546 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008548 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 /* try to encode it */
8550 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008551 if (charmaptranslate_output(input, i, mapping,
8552 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008553 Py_XDECREF(x);
8554 goto onError;
8555 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008556 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008558 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 else { /* untranslatable character */
8560 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8561 Py_ssize_t repsize;
8562 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008563 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008565 Py_ssize_t collstart = i;
8566 Py_ssize_t collend = i+1;
8567 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 while (collend < size) {
8571 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 goto onError;
8573 Py_XDECREF(x);
8574 if (x!=Py_None)
8575 break;
8576 ++collend;
8577 }
8578 /* cache callback name lookup
8579 * (if not done yet, i.e. it's the first error) */
8580 if (known_errorHandler==-1) {
8581 if ((errors==NULL) || (!strcmp(errors, "strict")))
8582 known_errorHandler = 1;
8583 else if (!strcmp(errors, "replace"))
8584 known_errorHandler = 2;
8585 else if (!strcmp(errors, "ignore"))
8586 known_errorHandler = 3;
8587 else if (!strcmp(errors, "xmlcharrefreplace"))
8588 known_errorHandler = 4;
8589 else
8590 known_errorHandler = 0;
8591 }
8592 switch (known_errorHandler) {
8593 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008594 raise_translate_exception(&exc, input, collstart,
8595 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008596 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 case 2: /* replace */
8598 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008599 for (coll = collstart; coll<collend; coll++)
8600 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 /* fall through */
8602 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008604 break;
8605 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008606 /* generate replacement (temporarily (mis)uses i) */
8607 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008608 char buffer[2+29+1+1];
8609 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008610 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8611 if (charmaptranslate_makespace(&output, &osize,
8612 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 goto onError;
8614 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008615 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008617 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 break;
8619 default:
8620 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008621 reason, input, &exc,
8622 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008623 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008625 if (PyUnicode_READY(repunicode) < 0) {
8626 Py_DECREF(repunicode);
8627 goto onError;
8628 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008629 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630 repsize = PyUnicode_GET_LENGTH(repunicode);
8631 if (charmaptranslate_makespace(&output, &osize,
8632 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008633 Py_DECREF(repunicode);
8634 goto onError;
8635 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636 for (uni2 = 0; repsize-->0; ++uni2)
8637 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8638 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008640 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008641 }
8642 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8644 if (!res)
8645 goto onError;
8646 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647 Py_XDECREF(exc);
8648 Py_XDECREF(errorHandler);
8649 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008652 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008653 Py_XDECREF(exc);
8654 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655 return NULL;
8656}
8657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008658/* Deprecated. Use PyUnicode_Translate instead. */
8659PyObject *
8660PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8661 Py_ssize_t size,
8662 PyObject *mapping,
8663 const char *errors)
8664{
8665 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8666 if (!unicode)
8667 return NULL;
8668 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8669}
8670
Alexander Belopolsky40018472011-02-26 01:02:56 +00008671PyObject *
8672PyUnicode_Translate(PyObject *str,
8673 PyObject *mapping,
8674 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675{
8676 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008677
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678 str = PyUnicode_FromObject(str);
8679 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008681 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 Py_DECREF(str);
8683 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008684
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686 Py_XDECREF(str);
8687 return NULL;
8688}
Tim Petersced69f82003-09-16 20:30:58 +00008689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008691fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008692{
8693 /* No need to call PyUnicode_READY(self) because this function is only
8694 called as a callback from fixup() which does it already. */
8695 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8696 const int kind = PyUnicode_KIND(self);
8697 void *data = PyUnicode_DATA(self);
8698 Py_UCS4 maxchar = 0, ch, fixed;
8699 Py_ssize_t i;
8700
8701 for (i = 0; i < len; ++i) {
8702 ch = PyUnicode_READ(kind, data, i);
8703 fixed = 0;
8704 if (ch > 127) {
8705 if (Py_UNICODE_ISSPACE(ch))
8706 fixed = ' ';
8707 else {
8708 const int decimal = Py_UNICODE_TODECIMAL(ch);
8709 if (decimal >= 0)
8710 fixed = '0' + decimal;
8711 }
8712 if (fixed != 0) {
8713 if (fixed > maxchar)
8714 maxchar = fixed;
8715 PyUnicode_WRITE(kind, data, i, fixed);
8716 }
8717 else if (ch > maxchar)
8718 maxchar = ch;
8719 }
8720 else if (ch > maxchar)
8721 maxchar = ch;
8722 }
8723
8724 return maxchar;
8725}
8726
8727PyObject *
8728_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8729{
8730 if (!PyUnicode_Check(unicode)) {
8731 PyErr_BadInternalCall();
8732 return NULL;
8733 }
8734 if (PyUnicode_READY(unicode) == -1)
8735 return NULL;
8736 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8737 /* If the string is already ASCII, just return the same string */
8738 Py_INCREF(unicode);
8739 return unicode;
8740 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008741 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008742}
8743
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008744PyObject *
8745PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8746 Py_ssize_t length)
8747{
Victor Stinnerf0124502011-11-21 23:12:56 +01008748 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008749 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008750 Py_UCS4 maxchar;
8751 enum PyUnicode_Kind kind;
8752 void *data;
8753
8754 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008755 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008756 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008757 if (ch > 127) {
8758 int decimal = Py_UNICODE_TODECIMAL(ch);
8759 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008760 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008761 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008762 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008763 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008764
8765 /* Copy to a new string */
8766 decimal = PyUnicode_New(length, maxchar);
8767 if (decimal == NULL)
8768 return decimal;
8769 kind = PyUnicode_KIND(decimal);
8770 data = PyUnicode_DATA(decimal);
8771 /* Iterate over code points */
8772 for (i = 0; i < length; i++) {
8773 Py_UNICODE ch = s[i];
8774 if (ch > 127) {
8775 int decimal = Py_UNICODE_TODECIMAL(ch);
8776 if (decimal >= 0)
8777 ch = '0' + decimal;
8778 }
8779 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008781 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008782}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008783/* --- Decimal Encoder ---------------------------------------------------- */
8784
Alexander Belopolsky40018472011-02-26 01:02:56 +00008785int
8786PyUnicode_EncodeDecimal(Py_UNICODE *s,
8787 Py_ssize_t length,
8788 char *output,
8789 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008790{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008791 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008792 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008793 enum PyUnicode_Kind kind;
8794 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008795
8796 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 PyErr_BadArgument();
8798 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008799 }
8800
Victor Stinner42bf7752011-11-21 22:52:58 +01008801 unicode = PyUnicode_FromUnicode(s, length);
8802 if (unicode == NULL)
8803 return -1;
8804
Victor Stinner6345be92011-11-25 20:09:01 +01008805 if (PyUnicode_READY(unicode) < 0) {
8806 Py_DECREF(unicode);
8807 return -1;
8808 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008809 kind = PyUnicode_KIND(unicode);
8810 data = PyUnicode_DATA(unicode);
8811
Victor Stinnerb84d7232011-11-22 01:50:07 +01008812 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008813 PyObject *exc;
8814 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008815 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008816 Py_ssize_t startpos;
8817
8818 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008819
Benjamin Peterson29060642009-01-31 22:14:21 +00008820 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008821 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008822 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008824 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008825 decimal = Py_UNICODE_TODECIMAL(ch);
8826 if (decimal >= 0) {
8827 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008828 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008829 continue;
8830 }
8831 if (0 < ch && ch < 256) {
8832 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008833 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 continue;
8835 }
Victor Stinner6345be92011-11-25 20:09:01 +01008836
Victor Stinner42bf7752011-11-21 22:52:58 +01008837 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008838 exc = NULL;
8839 raise_encode_exception(&exc, "decimal", unicode,
8840 startpos, startpos+1,
8841 "invalid decimal Unicode string");
8842 Py_XDECREF(exc);
8843 Py_DECREF(unicode);
8844 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008845 }
8846 /* 0-terminate the output string */
8847 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008848 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008849 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008850}
8851
Guido van Rossumd57fd912000-03-10 22:53:23 +00008852/* --- Helpers ------------------------------------------------------------ */
8853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008854static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008855any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008856 Py_ssize_t start,
8857 Py_ssize_t end)
8858{
8859 int kind1, kind2, kind;
8860 void *buf1, *buf2;
8861 Py_ssize_t len1, len2, result;
8862
8863 kind1 = PyUnicode_KIND(s1);
8864 kind2 = PyUnicode_KIND(s2);
8865 kind = kind1 > kind2 ? kind1 : kind2;
8866 buf1 = PyUnicode_DATA(s1);
8867 buf2 = PyUnicode_DATA(s2);
8868 if (kind1 != kind)
8869 buf1 = _PyUnicode_AsKind(s1, kind);
8870 if (!buf1)
8871 return -2;
8872 if (kind2 != kind)
8873 buf2 = _PyUnicode_AsKind(s2, kind);
8874 if (!buf2) {
8875 if (kind1 != kind) PyMem_Free(buf1);
8876 return -2;
8877 }
8878 len1 = PyUnicode_GET_LENGTH(s1);
8879 len2 = PyUnicode_GET_LENGTH(s2);
8880
Victor Stinner794d5672011-10-10 03:21:36 +02008881 if (direction > 0) {
8882 switch(kind) {
8883 case PyUnicode_1BYTE_KIND:
8884 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8885 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8886 else
8887 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8888 break;
8889 case PyUnicode_2BYTE_KIND:
8890 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8891 break;
8892 case PyUnicode_4BYTE_KIND:
8893 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8894 break;
8895 default:
8896 assert(0); result = -2;
8897 }
8898 }
8899 else {
8900 switch(kind) {
8901 case PyUnicode_1BYTE_KIND:
8902 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8903 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8904 else
8905 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8906 break;
8907 case PyUnicode_2BYTE_KIND:
8908 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8909 break;
8910 case PyUnicode_4BYTE_KIND:
8911 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8912 break;
8913 default:
8914 assert(0); result = -2;
8915 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916 }
8917
8918 if (kind1 != kind)
8919 PyMem_Free(buf1);
8920 if (kind2 != kind)
8921 PyMem_Free(buf2);
8922
8923 return result;
8924}
8925
8926Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008927_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008928 Py_ssize_t n_buffer,
8929 void *digits, Py_ssize_t n_digits,
8930 Py_ssize_t min_width,
8931 const char *grouping,
8932 const char *thousands_sep)
8933{
8934 switch(kind) {
8935 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008936 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8937 return _PyUnicode_ascii_InsertThousandsGrouping(
8938 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8939 min_width, grouping, thousands_sep);
8940 else
8941 return _PyUnicode_ucs1_InsertThousandsGrouping(
8942 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8943 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944 case PyUnicode_2BYTE_KIND:
8945 return _PyUnicode_ucs2_InsertThousandsGrouping(
8946 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8947 min_width, grouping, thousands_sep);
8948 case PyUnicode_4BYTE_KIND:
8949 return _PyUnicode_ucs4_InsertThousandsGrouping(
8950 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8951 min_width, grouping, thousands_sep);
8952 }
8953 assert(0);
8954 return -1;
8955}
8956
8957
/* Helper macro to fix up start/end slice values against a length:
   `end` is clamped to `len`; negative `start`/`end` values count from
   the end of the sequence and are clamped to 0.  `start` and `end` must
   be modifiable lvalues and are updated in place.
   Every argument is parenthesized so that expression arguments (for
   example a conditional expression passed as `len`) are evaluated as a
   unit.  The macro expands to plain `if` statements, so it must be used
   as a statement of its own, not as the unbraced body of an if/else. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008972
Alexander Belopolsky40018472011-02-26 01:02:56 +00008973Py_ssize_t
8974PyUnicode_Count(PyObject *str,
8975 PyObject *substr,
8976 Py_ssize_t start,
8977 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008979 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008980 PyObject* str_obj;
8981 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982 int kind1, kind2, kind;
8983 void *buf1 = NULL, *buf2 = NULL;
8984 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008985
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008986 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008987 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008988 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008989 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008990 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008991 Py_DECREF(str_obj);
8992 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993 }
Tim Petersced69f82003-09-16 20:30:58 +00008994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008995 kind1 = PyUnicode_KIND(str_obj);
8996 kind2 = PyUnicode_KIND(sub_obj);
8997 kind = kind1 > kind2 ? kind1 : kind2;
8998 buf1 = PyUnicode_DATA(str_obj);
8999 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009000 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001 if (!buf1)
9002 goto onError;
9003 buf2 = PyUnicode_DATA(sub_obj);
9004 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009005 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009006 if (!buf2)
9007 goto onError;
9008 len1 = PyUnicode_GET_LENGTH(str_obj);
9009 len2 = PyUnicode_GET_LENGTH(sub_obj);
9010
9011 ADJUST_INDICES(start, end, len1);
9012 switch(kind) {
9013 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009014 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9015 result = asciilib_count(
9016 ((Py_UCS1*)buf1) + start, end - start,
9017 buf2, len2, PY_SSIZE_T_MAX
9018 );
9019 else
9020 result = ucs1lib_count(
9021 ((Py_UCS1*)buf1) + start, end - start,
9022 buf2, len2, PY_SSIZE_T_MAX
9023 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009024 break;
9025 case PyUnicode_2BYTE_KIND:
9026 result = ucs2lib_count(
9027 ((Py_UCS2*)buf1) + start, end - start,
9028 buf2, len2, PY_SSIZE_T_MAX
9029 );
9030 break;
9031 case PyUnicode_4BYTE_KIND:
9032 result = ucs4lib_count(
9033 ((Py_UCS4*)buf1) + start, end - start,
9034 buf2, len2, PY_SSIZE_T_MAX
9035 );
9036 break;
9037 default:
9038 assert(0); result = 0;
9039 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009040
9041 Py_DECREF(sub_obj);
9042 Py_DECREF(str_obj);
9043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044 if (kind1 != kind)
9045 PyMem_Free(buf1);
9046 if (kind2 != kind)
9047 PyMem_Free(buf2);
9048
Guido van Rossumd57fd912000-03-10 22:53:23 +00009049 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 onError:
9051 Py_DECREF(sub_obj);
9052 Py_DECREF(str_obj);
9053 if (kind1 != kind && buf1)
9054 PyMem_Free(buf1);
9055 if (kind2 != kind && buf2)
9056 PyMem_Free(buf2);
9057 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058}
9059
Alexander Belopolsky40018472011-02-26 01:02:56 +00009060Py_ssize_t
9061PyUnicode_Find(PyObject *str,
9062 PyObject *sub,
9063 Py_ssize_t start,
9064 Py_ssize_t end,
9065 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009067 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009068
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009071 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009072 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009073 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009074 Py_DECREF(str);
9075 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009076 }
Tim Petersced69f82003-09-16 20:30:58 +00009077
Victor Stinner794d5672011-10-10 03:21:36 +02009078 result = any_find_slice(direction,
9079 str, sub, start, end
9080 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009081
Guido van Rossumd57fd912000-03-10 22:53:23 +00009082 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009083 Py_DECREF(sub);
9084
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085 return result;
9086}
9087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088Py_ssize_t
9089PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9090 Py_ssize_t start, Py_ssize_t end,
9091 int direction)
9092{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009093 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009094 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095 if (PyUnicode_READY(str) == -1)
9096 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009097 if (start < 0 || end < 0) {
9098 PyErr_SetString(PyExc_IndexError, "string index out of range");
9099 return -2;
9100 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009101 if (end > PyUnicode_GET_LENGTH(str))
9102 end = PyUnicode_GET_LENGTH(str);
9103 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009104 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9105 kind, end-start, ch, direction);
9106 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009107 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009108 else
9109 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009110}
9111
Alexander Belopolsky40018472011-02-26 01:02:56 +00009112static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009113tailmatch(PyObject *self,
9114 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009115 Py_ssize_t start,
9116 Py_ssize_t end,
9117 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009118{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009119 int kind_self;
9120 int kind_sub;
9121 void *data_self;
9122 void *data_sub;
9123 Py_ssize_t offset;
9124 Py_ssize_t i;
9125 Py_ssize_t end_sub;
9126
9127 if (PyUnicode_READY(self) == -1 ||
9128 PyUnicode_READY(substring) == -1)
9129 return 0;
9130
9131 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009132 return 1;
9133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009134 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9135 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009136 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009137 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009139 kind_self = PyUnicode_KIND(self);
9140 data_self = PyUnicode_DATA(self);
9141 kind_sub = PyUnicode_KIND(substring);
9142 data_sub = PyUnicode_DATA(substring);
9143 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9144
9145 if (direction > 0)
9146 offset = end;
9147 else
9148 offset = start;
9149
9150 if (PyUnicode_READ(kind_self, data_self, offset) ==
9151 PyUnicode_READ(kind_sub, data_sub, 0) &&
9152 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9153 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9154 /* If both are of the same kind, memcmp is sufficient */
9155 if (kind_self == kind_sub) {
9156 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009157 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009158 data_sub,
9159 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009160 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009161 }
9162 /* otherwise we have to compare each character by first accesing it */
9163 else {
9164 /* We do not need to compare 0 and len(substring)-1 because
9165 the if statement above ensured already that they are equal
9166 when we end up here. */
9167 // TODO: honor direction and do a forward or backwards search
9168 for (i = 1; i < end_sub; ++i) {
9169 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9170 PyUnicode_READ(kind_sub, data_sub, i))
9171 return 0;
9172 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009173 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009174 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009175 }
9176
9177 return 0;
9178}
9179
Alexander Belopolsky40018472011-02-26 01:02:56 +00009180Py_ssize_t
9181PyUnicode_Tailmatch(PyObject *str,
9182 PyObject *substr,
9183 Py_ssize_t start,
9184 Py_ssize_t end,
9185 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009186{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009187 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009188
Guido van Rossumd57fd912000-03-10 22:53:23 +00009189 str = PyUnicode_FromObject(str);
9190 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009191 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009192 substr = PyUnicode_FromObject(substr);
9193 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009194 Py_DECREF(str);
9195 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009196 }
Tim Petersced69f82003-09-16 20:30:58 +00009197
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009198 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009199 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009200 Py_DECREF(str);
9201 Py_DECREF(substr);
9202 return result;
9203}
9204
Guido van Rossumd57fd912000-03-10 22:53:23 +00009205/* Apply fixfct filter to the Unicode object self and return a
9206 reference to the modified object */
9207
Alexander Belopolsky40018472011-02-26 01:02:56 +00009208static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009209fixup(PyObject *self,
9210 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009211{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009212 PyObject *u;
9213 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009214
Victor Stinner87af4f22011-11-21 23:03:47 +01009215 u = PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009216 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009217 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009218 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009220 /* fix functions return the new maximum character in a string,
9221 if the kind of the resulting unicode object does not change,
9222 everything is fine. Otherwise we need to change the string kind
9223 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009224 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009225 if (maxchar_new == 0)
9226 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9227 else if (maxchar_new <= 127)
9228 maxchar_new = 127;
9229 else if (maxchar_new <= 255)
9230 maxchar_new = 255;
9231 else if (maxchar_new <= 65535)
9232 maxchar_new = 65535;
9233 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009234 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009235
9236 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009237 /* fixfct should return TRUE if it modified the buffer. If
9238 FALSE, return a reference to the original buffer instead
9239 (to save space, not time) */
9240 Py_INCREF(self);
9241 Py_DECREF(u);
Victor Stinner7931d9a2011-11-04 00:22:48 +01009242 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009243 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244 else if (maxchar_new == maxchar_old) {
9245 return u;
9246 }
9247 else {
9248 /* In case the maximum character changed, we need to
9249 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009250 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009251 if (v == NULL) {
9252 Py_DECREF(u);
9253 return NULL;
9254 }
9255 if (maxchar_new > maxchar_old) {
9256 /* If the maxchar increased so that the kind changed, not all
9257 characters are representable anymore and we need to fix the
9258 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009259 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009260 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9262 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009263 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009264 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009265 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266
9267 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009268 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009269 return v;
9270 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009271}
9272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009274fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009275{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009276 /* No need to call PyUnicode_READY(self) because this function is only
9277 called as a callback from fixup() which does it already. */
9278 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9279 const int kind = PyUnicode_KIND(self);
9280 void *data = PyUnicode_DATA(self);
9281 int touched = 0;
9282 Py_UCS4 maxchar = 0;
9283 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285 for (i = 0; i < len; ++i) {
9286 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9287 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9288 if (up != ch) {
9289 if (up > maxchar)
9290 maxchar = up;
9291 PyUnicode_WRITE(kind, data, i, up);
9292 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009293 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009294 else if (ch > maxchar)
9295 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009296 }
9297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298 if (touched)
9299 return maxchar;
9300 else
9301 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009302}
9303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009304static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009305fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009306{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009307 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9308 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9309 const int kind = PyUnicode_KIND(self);
9310 void *data = PyUnicode_DATA(self);
9311 int touched = 0;
9312 Py_UCS4 maxchar = 0;
9313 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009314
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315 for(i = 0; i < len; ++i) {
9316 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9317 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9318 if (lo != ch) {
9319 if (lo > maxchar)
9320 maxchar = lo;
9321 PyUnicode_WRITE(kind, data, i, lo);
9322 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009323 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009324 else if (ch > maxchar)
9325 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009326 }
9327
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009328 if (touched)
9329 return maxchar;
9330 else
9331 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009332}
9333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009334static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009335fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009336{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009337 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9338 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9339 const int kind = PyUnicode_KIND(self);
9340 void *data = PyUnicode_DATA(self);
9341 int touched = 0;
9342 Py_UCS4 maxchar = 0;
9343 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009345 for(i = 0; i < len; ++i) {
9346 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9347 Py_UCS4 nu = 0;
9348
9349 if (Py_UNICODE_ISUPPER(ch))
9350 nu = Py_UNICODE_TOLOWER(ch);
9351 else if (Py_UNICODE_ISLOWER(ch))
9352 nu = Py_UNICODE_TOUPPER(ch);
9353
9354 if (nu != 0) {
9355 if (nu > maxchar)
9356 maxchar = nu;
9357 PyUnicode_WRITE(kind, data, i, nu);
9358 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 else if (ch > maxchar)
9361 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362 }
9363
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009364 if (touched)
9365 return maxchar;
9366 else
9367 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368}
9369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009370static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009371fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009372{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009373 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9374 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9375 const int kind = PyUnicode_KIND(self);
9376 void *data = PyUnicode_DATA(self);
9377 int touched = 0;
9378 Py_UCS4 maxchar = 0;
9379 Py_ssize_t i = 0;
9380 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009381
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009382 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009383 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009384
9385 ch = PyUnicode_READ(kind, data, i);
9386 if (!Py_UNICODE_ISUPPER(ch)) {
9387 maxchar = Py_UNICODE_TOUPPER(ch);
9388 PyUnicode_WRITE(kind, data, i, maxchar);
9389 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009390 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391 ++i;
9392 for(; i < len; ++i) {
9393 ch = PyUnicode_READ(kind, data, i);
9394 if (!Py_UNICODE_ISLOWER(ch)) {
9395 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9396 if (lo > maxchar)
9397 maxchar = lo;
9398 PyUnicode_WRITE(kind, data, i, lo);
9399 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009400 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 else if (ch > maxchar)
9402 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009403 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404
9405 if (touched)
9406 return maxchar;
9407 else
9408 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009409}
9410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009412fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009413{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9415 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9416 const int kind = PyUnicode_KIND(self);
9417 void *data = PyUnicode_DATA(self);
9418 Py_UCS4 maxchar = 0;
9419 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009420 int previous_is_cased;
9421
9422 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 if (len == 1) {
9424 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9425 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9426 if (ti != ch) {
9427 PyUnicode_WRITE(kind, data, i, ti);
9428 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009429 }
9430 else
9431 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009432 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009433 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009434 for(; i < len; ++i) {
9435 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9436 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009437
Benjamin Peterson29060642009-01-31 22:14:21 +00009438 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009440 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009441 nu = Py_UNICODE_TOTITLE(ch);
9442
9443 if (nu > maxchar)
9444 maxchar = nu;
9445 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009446
Benjamin Peterson29060642009-01-31 22:14:21 +00009447 if (Py_UNICODE_ISLOWER(ch) ||
9448 Py_UNICODE_ISUPPER(ch) ||
9449 Py_UNICODE_ISTITLE(ch))
9450 previous_is_cased = 1;
9451 else
9452 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009455}
9456
Tim Peters8ce9f162004-08-27 01:49:32 +00009457PyObject *
9458PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009459{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009461 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009463 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009464 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9465 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009466 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009468 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009469 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009470 int use_memcpy;
9471 unsigned char *res_data = NULL, *sep_data = NULL;
9472 PyObject *last_obj;
9473 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474
Tim Peters05eba1f2004-08-27 21:32:02 +00009475 fseq = PySequence_Fast(seq, "");
9476 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009477 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009478 }
9479
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009480 /* NOTE: the following code can't call back into Python code,
9481 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009482 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009483
Tim Peters05eba1f2004-08-27 21:32:02 +00009484 seqlen = PySequence_Fast_GET_SIZE(fseq);
9485 /* If empty sequence, return u"". */
9486 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009487 Py_DECREF(fseq);
9488 Py_INCREF(unicode_empty);
9489 res = unicode_empty;
9490 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009491 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009492
Tim Peters05eba1f2004-08-27 21:32:02 +00009493 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009494 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009495 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009496 if (seqlen == 1) {
9497 if (PyUnicode_CheckExact(items[0])) {
9498 res = items[0];
9499 Py_INCREF(res);
9500 Py_DECREF(fseq);
9501 return res;
9502 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009503 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009504 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009505 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009506 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009507 /* Set up sep and seplen */
9508 if (separator == NULL) {
9509 /* fall back to a blank space separator */
9510 sep = PyUnicode_FromOrdinal(' ');
9511 if (!sep)
9512 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009513 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009514 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009515 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009516 else {
9517 if (!PyUnicode_Check(separator)) {
9518 PyErr_Format(PyExc_TypeError,
9519 "separator: expected str instance,"
9520 " %.80s found",
9521 Py_TYPE(separator)->tp_name);
9522 goto onError;
9523 }
9524 if (PyUnicode_READY(separator))
9525 goto onError;
9526 sep = separator;
9527 seplen = PyUnicode_GET_LENGTH(separator);
9528 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9529 /* inc refcount to keep this code path symmetric with the
9530 above case of a blank separator */
9531 Py_INCREF(sep);
9532 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009533 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009534 }
9535
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009536 /* There are at least two things to join, or else we have a subclass
9537 * of str in the sequence.
9538 * Do a pre-pass to figure out the total amount of space we'll
9539 * need (sz), and see whether all argument are strings.
9540 */
9541 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009542#ifdef Py_DEBUG
9543 use_memcpy = 0;
9544#else
9545 use_memcpy = 1;
9546#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009547 for (i = 0; i < seqlen; i++) {
9548 const Py_ssize_t old_sz = sz;
9549 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009550 if (!PyUnicode_Check(item)) {
9551 PyErr_Format(PyExc_TypeError,
9552 "sequence item %zd: expected str instance,"
9553 " %.80s found",
9554 i, Py_TYPE(item)->tp_name);
9555 goto onError;
9556 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009557 if (PyUnicode_READY(item) == -1)
9558 goto onError;
9559 sz += PyUnicode_GET_LENGTH(item);
9560 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009561 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009562 if (i != 0)
9563 sz += seplen;
9564 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9565 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009566 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009567 goto onError;
9568 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009569 if (use_memcpy && last_obj != NULL) {
9570 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9571 use_memcpy = 0;
9572 }
9573 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009574 }
Tim Petersced69f82003-09-16 20:30:58 +00009575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009576 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009577 if (res == NULL)
9578 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009579
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009580 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009581#ifdef Py_DEBUG
9582 use_memcpy = 0;
9583#else
9584 if (use_memcpy) {
9585 res_data = PyUnicode_1BYTE_DATA(res);
9586 kind = PyUnicode_KIND(res);
9587 if (seplen != 0)
9588 sep_data = PyUnicode_1BYTE_DATA(sep);
9589 }
9590#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009591 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009592 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009593 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009594 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009595 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009596 if (use_memcpy) {
9597 Py_MEMCPY(res_data,
9598 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009599 kind * seplen);
9600 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009601 }
9602 else {
9603 copy_characters(res, res_offset, sep, 0, seplen);
9604 res_offset += seplen;
9605 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009606 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009607 itemlen = PyUnicode_GET_LENGTH(item);
9608 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009609 if (use_memcpy) {
9610 Py_MEMCPY(res_data,
9611 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009612 kind * itemlen);
9613 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009614 }
9615 else {
9616 copy_characters(res, res_offset, item, 0, itemlen);
9617 res_offset += itemlen;
9618 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009619 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009620 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009621 if (use_memcpy)
9622 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009623 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009624 else
9625 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009626
Tim Peters05eba1f2004-08-27 21:32:02 +00009627 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009629 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631
Benjamin Peterson29060642009-01-31 22:14:21 +00009632 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009633 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009634 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009635 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009636 return NULL;
9637}
9638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009639#define FILL(kind, data, value, start, length) \
9640 do { \
9641 Py_ssize_t i_ = 0; \
9642 assert(kind != PyUnicode_WCHAR_KIND); \
9643 switch ((kind)) { \
9644 case PyUnicode_1BYTE_KIND: { \
9645 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9646 memset(to_, (unsigned char)value, length); \
9647 break; \
9648 } \
9649 case PyUnicode_2BYTE_KIND: { \
9650 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9651 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9652 break; \
9653 } \
9654 default: { \
9655 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9656 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9657 break; \
9658 } \
9659 } \
9660 } while (0)
9661
Victor Stinner9310abb2011-10-05 00:59:23 +02009662static PyObject *
9663pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009664 Py_ssize_t left,
9665 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009666 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009667{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009668 PyObject *u;
9669 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009670 int kind;
9671 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009672
9673 if (left < 0)
9674 left = 0;
9675 if (right < 0)
9676 right = 0;
9677
Tim Peters7a29bd52001-09-12 03:03:31 +00009678 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679 Py_INCREF(self);
9680 return self;
9681 }
9682
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009683 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9684 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009685 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9686 return NULL;
9687 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009688 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9689 if (fill > maxchar)
9690 maxchar = fill;
9691 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009692 if (!u)
9693 return NULL;
9694
9695 kind = PyUnicode_KIND(u);
9696 data = PyUnicode_DATA(u);
9697 if (left)
9698 FILL(kind, data, fill, 0, left);
9699 if (right)
9700 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009701 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009702 assert(_PyUnicode_CheckConsistency(u, 1));
9703 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009705#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009706
Alexander Belopolsky40018472011-02-26 01:02:56 +00009707PyObject *
9708PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009711
9712 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009714 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716 switch(PyUnicode_KIND(string)) {
9717 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009718 if (PyUnicode_IS_ASCII(string))
9719 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009720 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009721 PyUnicode_GET_LENGTH(string), keepends);
9722 else
9723 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009724 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009725 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 break;
9727 case PyUnicode_2BYTE_KIND:
9728 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009729 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009730 PyUnicode_GET_LENGTH(string), keepends);
9731 break;
9732 case PyUnicode_4BYTE_KIND:
9733 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009734 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 PyUnicode_GET_LENGTH(string), keepends);
9736 break;
9737 default:
9738 assert(0);
9739 list = 0;
9740 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009741 Py_DECREF(string);
9742 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009743}
9744
Alexander Belopolsky40018472011-02-26 01:02:56 +00009745static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009746split(PyObject *self,
9747 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009748 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009749{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 int kind1, kind2, kind;
9751 void *buf1, *buf2;
9752 Py_ssize_t len1, len2;
9753 PyObject* out;
9754
Guido van Rossumd57fd912000-03-10 22:53:23 +00009755 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009756 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009758 if (PyUnicode_READY(self) == -1)
9759 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009760
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009761 if (substring == NULL)
9762 switch(PyUnicode_KIND(self)) {
9763 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009764 if (PyUnicode_IS_ASCII(self))
9765 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009766 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009767 PyUnicode_GET_LENGTH(self), maxcount
9768 );
9769 else
9770 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009771 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009772 PyUnicode_GET_LENGTH(self), maxcount
9773 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009774 case PyUnicode_2BYTE_KIND:
9775 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009776 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009777 PyUnicode_GET_LENGTH(self), maxcount
9778 );
9779 case PyUnicode_4BYTE_KIND:
9780 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009781 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 PyUnicode_GET_LENGTH(self), maxcount
9783 );
9784 default:
9785 assert(0);
9786 return NULL;
9787 }
9788
9789 if (PyUnicode_READY(substring) == -1)
9790 return NULL;
9791
9792 kind1 = PyUnicode_KIND(self);
9793 kind2 = PyUnicode_KIND(substring);
9794 kind = kind1 > kind2 ? kind1 : kind2;
9795 buf1 = PyUnicode_DATA(self);
9796 buf2 = PyUnicode_DATA(substring);
9797 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009798 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009799 if (!buf1)
9800 return NULL;
9801 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009802 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009803 if (!buf2) {
9804 if (kind1 != kind) PyMem_Free(buf1);
9805 return NULL;
9806 }
9807 len1 = PyUnicode_GET_LENGTH(self);
9808 len2 = PyUnicode_GET_LENGTH(substring);
9809
9810 switch(kind) {
9811 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009812 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9813 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009814 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009815 else
9816 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009817 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009818 break;
9819 case PyUnicode_2BYTE_KIND:
9820 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009821 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 break;
9823 case PyUnicode_4BYTE_KIND:
9824 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009825 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009826 break;
9827 default:
9828 out = NULL;
9829 }
9830 if (kind1 != kind)
9831 PyMem_Free(buf1);
9832 if (kind2 != kind)
9833 PyMem_Free(buf2);
9834 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009835}
9836
Alexander Belopolsky40018472011-02-26 01:02:56 +00009837static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009838rsplit(PyObject *self,
9839 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009840 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009841{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009842 int kind1, kind2, kind;
9843 void *buf1, *buf2;
9844 Py_ssize_t len1, len2;
9845 PyObject* out;
9846
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009847 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009848 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009849
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009850 if (PyUnicode_READY(self) == -1)
9851 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009852
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009853 if (substring == NULL)
9854 switch(PyUnicode_KIND(self)) {
9855 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009856 if (PyUnicode_IS_ASCII(self))
9857 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009858 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009859 PyUnicode_GET_LENGTH(self), maxcount
9860 );
9861 else
9862 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009863 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009864 PyUnicode_GET_LENGTH(self), maxcount
9865 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 case PyUnicode_2BYTE_KIND:
9867 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009868 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 PyUnicode_GET_LENGTH(self), maxcount
9870 );
9871 case PyUnicode_4BYTE_KIND:
9872 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009873 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009874 PyUnicode_GET_LENGTH(self), maxcount
9875 );
9876 default:
9877 assert(0);
9878 return NULL;
9879 }
9880
9881 if (PyUnicode_READY(substring) == -1)
9882 return NULL;
9883
9884 kind1 = PyUnicode_KIND(self);
9885 kind2 = PyUnicode_KIND(substring);
9886 kind = kind1 > kind2 ? kind1 : kind2;
9887 buf1 = PyUnicode_DATA(self);
9888 buf2 = PyUnicode_DATA(substring);
9889 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009890 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891 if (!buf1)
9892 return NULL;
9893 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009894 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009895 if (!buf2) {
9896 if (kind1 != kind) PyMem_Free(buf1);
9897 return NULL;
9898 }
9899 len1 = PyUnicode_GET_LENGTH(self);
9900 len2 = PyUnicode_GET_LENGTH(substring);
9901
9902 switch(kind) {
9903 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009904 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9905 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009906 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009907 else
9908 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009909 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910 break;
9911 case PyUnicode_2BYTE_KIND:
9912 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009913 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009914 break;
9915 case PyUnicode_4BYTE_KIND:
9916 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009917 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009918 break;
9919 default:
9920 out = NULL;
9921 }
9922 if (kind1 != kind)
9923 PyMem_Free(buf1);
9924 if (kind2 != kind)
9925 PyMem_Free(buf2);
9926 return out;
9927}
9928
9929static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009930anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9931 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009932{
9933 switch(kind) {
9934 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009935 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9936 return asciilib_find(buf1, len1, buf2, len2, offset);
9937 else
9938 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 case PyUnicode_2BYTE_KIND:
9940 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9941 case PyUnicode_4BYTE_KIND:
9942 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9943 }
9944 assert(0);
9945 return -1;
9946}
9947
9948static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009949anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9950 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951{
9952 switch(kind) {
9953 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009954 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9955 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9956 else
9957 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009958 case PyUnicode_2BYTE_KIND:
9959 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9960 case PyUnicode_4BYTE_KIND:
9961 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9962 }
9963 assert(0);
9964 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009965}
9966
Alexander Belopolsky40018472011-02-26 01:02:56 +00009967static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968replace(PyObject *self, PyObject *str1,
9969 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009970{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009971 PyObject *u;
9972 char *sbuf = PyUnicode_DATA(self);
9973 char *buf1 = PyUnicode_DATA(str1);
9974 char *buf2 = PyUnicode_DATA(str2);
9975 int srelease = 0, release1 = 0, release2 = 0;
9976 int skind = PyUnicode_KIND(self);
9977 int kind1 = PyUnicode_KIND(str1);
9978 int kind2 = PyUnicode_KIND(str2);
9979 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9980 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9981 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009982 int mayshrink;
9983 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009984
9985 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009986 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009987 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009988 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009989
Victor Stinner59de0ee2011-10-07 10:01:28 +02009990 if (str1 == str2)
9991 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 if (skind < kind1)
9993 /* substring too wide to be present */
9994 goto nothing;
9995
Victor Stinner49a0a212011-10-12 23:46:10 +02009996 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9997 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9998 /* Replacing str1 with str2 may cause a maxchar reduction in the
9999 result string. */
10000 mayshrink = (maxchar_str2 < maxchar);
10001 maxchar = Py_MAX(maxchar, maxchar_str2);
10002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010004 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010005 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010006 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010007 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010009 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010010 Py_UCS4 u1, u2;
10011 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010013 if (findchar(sbuf, PyUnicode_KIND(self),
10014 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010015 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010016 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010018 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010020 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 rkind = PyUnicode_KIND(u);
10022 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10023 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010024 if (--maxcount < 0)
10025 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010026 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010027 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010028 }
10029 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010030 int rkind = skind;
10031 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010033 if (kind1 < rkind) {
10034 /* widen substring */
10035 buf1 = _PyUnicode_AsKind(str1, rkind);
10036 if (!buf1) goto error;
10037 release1 = 1;
10038 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010039 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010040 if (i < 0)
10041 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 if (rkind > kind2) {
10043 /* widen replacement */
10044 buf2 = _PyUnicode_AsKind(str2, rkind);
10045 if (!buf2) goto error;
10046 release2 = 1;
10047 }
10048 else if (rkind < kind2) {
10049 /* widen self and buf1 */
10050 rkind = kind2;
10051 if (release1) PyMem_Free(buf1);
10052 sbuf = _PyUnicode_AsKind(self, rkind);
10053 if (!sbuf) goto error;
10054 srelease = 1;
10055 buf1 = _PyUnicode_AsKind(str1, rkind);
10056 if (!buf1) goto error;
10057 release1 = 1;
10058 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010059 u = PyUnicode_New(slen, maxchar);
10060 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010062 assert(PyUnicode_KIND(u) == rkind);
10063 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010064
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010065 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010066 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010067 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010068 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010069 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010071
10072 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010073 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010074 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010075 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010076 if (i == -1)
10077 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010078 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010079 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010080 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010082 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010083 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010084 }
10085 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 Py_ssize_t n, i, j, ires;
10087 Py_ssize_t product, new_size;
10088 int rkind = skind;
10089 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010092 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 buf1 = _PyUnicode_AsKind(str1, rkind);
10094 if (!buf1) goto error;
10095 release1 = 1;
10096 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010097 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010098 if (n == 0)
10099 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010100 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010101 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010102 buf2 = _PyUnicode_AsKind(str2, rkind);
10103 if (!buf2) goto error;
10104 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010105 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010107 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 rkind = kind2;
10109 sbuf = _PyUnicode_AsKind(self, rkind);
10110 if (!sbuf) goto error;
10111 srelease = 1;
10112 if (release1) PyMem_Free(buf1);
10113 buf1 = _PyUnicode_AsKind(str1, rkind);
10114 if (!buf1) goto error;
10115 release1 = 1;
10116 }
10117 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10118 PyUnicode_GET_LENGTH(str1))); */
10119 product = n * (len2-len1);
10120 if ((product / (len2-len1)) != n) {
10121 PyErr_SetString(PyExc_OverflowError,
10122 "replace string is too long");
10123 goto error;
10124 }
10125 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010126 if (new_size == 0) {
10127 Py_INCREF(unicode_empty);
10128 u = unicode_empty;
10129 goto done;
10130 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10132 PyErr_SetString(PyExc_OverflowError,
10133 "replace string is too long");
10134 goto error;
10135 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010136 u = PyUnicode_New(new_size, maxchar);
10137 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010139 assert(PyUnicode_KIND(u) == rkind);
10140 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 ires = i = 0;
10142 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010143 while (n-- > 0) {
10144 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010145 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010146 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010147 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010148 if (j == -1)
10149 break;
10150 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010151 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010152 memcpy(res + rkind * ires,
10153 sbuf + rkind * i,
10154 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010156 }
10157 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010159 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010161 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010163 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010165 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010167 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010168 memcpy(res + rkind * ires,
10169 sbuf + rkind * i,
10170 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010171 }
10172 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010173 /* interleave */
10174 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010175 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010177 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010179 if (--n <= 0)
10180 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010181 memcpy(res + rkind * ires,
10182 sbuf + rkind * i,
10183 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 ires++;
10185 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010186 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010187 memcpy(res + rkind * ires,
10188 sbuf + rkind * i,
10189 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010190 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010191 }
10192
10193 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010194 unicode_adjust_maxchar(&u);
10195 if (u == NULL)
10196 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010197 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010198
10199 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 if (srelease)
10201 PyMem_FREE(sbuf);
10202 if (release1)
10203 PyMem_FREE(buf1);
10204 if (release2)
10205 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010206 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010208
Benjamin Peterson29060642009-01-31 22:14:21 +000010209 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010210 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010211 if (srelease)
10212 PyMem_FREE(sbuf);
10213 if (release1)
10214 PyMem_FREE(buf1);
10215 if (release2)
10216 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010217 if (PyUnicode_CheckExact(self)) {
10218 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010219 return self;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010220 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010221 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 error:
10223 if (srelease && sbuf)
10224 PyMem_FREE(sbuf);
10225 if (release1 && buf1)
10226 PyMem_FREE(buf1);
10227 if (release2 && buf2)
10228 PyMem_FREE(buf2);
10229 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010230}
10231
10232/* --- Unicode Object Methods --------------------------------------------- */
10233
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010234PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010235 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236\n\
10237Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010238characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010239
10240static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010241unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010243 return fixup(self, fixtitle);
10244}
10245
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010246PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010247 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010248\n\
10249Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010250have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010251
10252static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010253unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010254{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255 return fixup(self, fixcapitalize);
10256}
10257
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out).  Splits `self` into a list, replaces every list
   item by its fixup(..., fixcapitalize) result, and joins the list again.
   Returns NULL if splitting or any per-item fixup fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        result = fixup(PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (result == NULL)
            goto finish;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  finish:
    Py_DECREF(words);
    return result;
}
#endif
10295
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010296/* Argument converter. Coerces to a single unicode character */
10297
10298static int
10299convert_uc(PyObject *obj, void *addr)
10300{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010302 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010303
Benjamin Peterson14339b62009-01-31 16:36:08 +000010304 uniobj = PyUnicode_FromObject(obj);
10305 if (uniobj == NULL) {
10306 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010307 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010308 return 0;
10309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010311 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010312 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010313 Py_DECREF(uniobj);
10314 return 0;
10315 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010317 Py_DECREF(uniobj);
10318 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010319}
10320
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010321PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010322 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010323\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010324Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010325done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010326
10327static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010328unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010329{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010330 Py_ssize_t marg, left;
10331 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 Py_UCS4 fillchar = ' ';
10333
Victor Stinnere9a29352011-10-01 02:14:59 +020010334 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010336
Victor Stinnere9a29352011-10-01 02:14:59 +020010337 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010338 return NULL;
10339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010341 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010342 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010343 }
10344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010346 left = marg / 2 + (marg & width & 1);
10347
Victor Stinner9310abb2011-10-05 00:59:23 +020010348 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010349}
10350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351/* This function assumes that str1 and str2 are readied by the caller. */
10352
Marc-André Lemburge5034372000-08-08 08:04:29 +000010353static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010354unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010355{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 int kind1, kind2;
10357 void *data1, *data2;
10358 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 kind1 = PyUnicode_KIND(str1);
10361 kind2 = PyUnicode_KIND(str2);
10362 data1 = PyUnicode_DATA(str1);
10363 data2 = PyUnicode_DATA(str2);
10364 len1 = PyUnicode_GET_LENGTH(str1);
10365 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 for (i = 0; i < len1 && i < len2; ++i) {
10368 Py_UCS4 c1, c2;
10369 c1 = PyUnicode_READ(kind1, data1, i);
10370 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010371
10372 if (c1 != c2)
10373 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010374 }
10375
10376 return (len1 < len2) ? -1 : (len1 != len2);
10377}
10378
Alexander Belopolsky40018472011-02-26 01:02:56 +000010379int
10380PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10383 if (PyUnicode_READY(left) == -1 ||
10384 PyUnicode_READY(right) == -1)
10385 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010386 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010388 PyErr_Format(PyExc_TypeError,
10389 "Can't compare %.100s and %.100s",
10390 left->ob_type->tp_name,
10391 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392 return -1;
10393}
10394
Martin v. Löwis5b222132007-06-10 09:51:05 +000010395int
10396PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10397{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 Py_ssize_t i;
10399 int kind;
10400 void *data;
10401 Py_UCS4 chr;
10402
Victor Stinner910337b2011-10-03 03:20:16 +020010403 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 if (PyUnicode_READY(uni) == -1)
10405 return -1;
10406 kind = PyUnicode_KIND(uni);
10407 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010408 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10410 if (chr != str[i])
10411 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010412 /* This check keeps Python strings that end in '\0' from comparing equal
10413 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010415 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010416 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010417 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010418 return 0;
10419}
10420
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010421
Benjamin Peterson29060642009-01-31 22:14:21 +000010422#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010423 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010424
Alexander Belopolsky40018472011-02-26 01:02:56 +000010425PyObject *
10426PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010427{
10428 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010429
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010430 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10431 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 if (PyUnicode_READY(left) == -1 ||
10433 PyUnicode_READY(right) == -1)
10434 return NULL;
10435 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10436 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010437 if (op == Py_EQ) {
10438 Py_INCREF(Py_False);
10439 return Py_False;
10440 }
10441 if (op == Py_NE) {
10442 Py_INCREF(Py_True);
10443 return Py_True;
10444 }
10445 }
10446 if (left == right)
10447 result = 0;
10448 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010449 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010450
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010451 /* Convert the return value to a Boolean */
10452 switch (op) {
10453 case Py_EQ:
10454 v = TEST_COND(result == 0);
10455 break;
10456 case Py_NE:
10457 v = TEST_COND(result != 0);
10458 break;
10459 case Py_LE:
10460 v = TEST_COND(result <= 0);
10461 break;
10462 case Py_GE:
10463 v = TEST_COND(result >= 0);
10464 break;
10465 case Py_LT:
10466 v = TEST_COND(result == -1);
10467 break;
10468 case Py_GT:
10469 v = TEST_COND(result == 1);
10470 break;
10471 default:
10472 PyErr_BadArgument();
10473 return NULL;
10474 }
10475 Py_INCREF(v);
10476 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010477 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010478
Brian Curtindfc80e32011-08-10 20:28:54 -050010479 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010480}
10481
Alexander Belopolsky40018472011-02-26 01:02:56 +000010482int
10483PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010484{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010485 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 int kind1, kind2, kind;
10487 void *buf1, *buf2;
10488 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010489 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010490
10491 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010492 sub = PyUnicode_FromObject(element);
10493 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010494 PyErr_Format(PyExc_TypeError,
10495 "'in <string>' requires string as left operand, not %s",
10496 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010497 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010498 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 if (PyUnicode_READY(sub) == -1)
10500 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010501
Thomas Wouters477c8d52006-05-27 19:21:47 +000010502 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010503 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010504 Py_DECREF(sub);
10505 return -1;
10506 }
10507
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 kind1 = PyUnicode_KIND(str);
10509 kind2 = PyUnicode_KIND(sub);
10510 kind = kind1 > kind2 ? kind1 : kind2;
10511 buf1 = PyUnicode_DATA(str);
10512 buf2 = PyUnicode_DATA(sub);
10513 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010514 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 if (!buf1) {
10516 Py_DECREF(sub);
10517 return -1;
10518 }
10519 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010520 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 if (!buf2) {
10522 Py_DECREF(sub);
10523 if (kind1 != kind) PyMem_Free(buf1);
10524 return -1;
10525 }
10526 len1 = PyUnicode_GET_LENGTH(str);
10527 len2 = PyUnicode_GET_LENGTH(sub);
10528
10529 switch(kind) {
10530 case PyUnicode_1BYTE_KIND:
10531 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10532 break;
10533 case PyUnicode_2BYTE_KIND:
10534 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10535 break;
10536 case PyUnicode_4BYTE_KIND:
10537 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10538 break;
10539 default:
10540 result = -1;
10541 assert(0);
10542 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010543
10544 Py_DECREF(str);
10545 Py_DECREF(sub);
10546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 if (kind1 != kind)
10548 PyMem_Free(buf1);
10549 if (kind2 != kind)
10550 PyMem_Free(buf2);
10551
Guido van Rossum403d68b2000-03-13 15:55:09 +000010552 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010553}
10554
Guido van Rossumd57fd912000-03-10 22:53:23 +000010555/* Concat to string or Unicode object giving a new Unicode object. */
10556
Alexander Belopolsky40018472011-02-26 01:02:56 +000010557PyObject *
10558PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010559{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010561 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010562
10563 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010565 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010566 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010568 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010569 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010570
10571 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010572 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010573 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010575 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010576 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010577 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010579 }
10580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010582 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10583 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584
Guido van Rossumd57fd912000-03-10 22:53:23 +000010585 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 w = PyUnicode_New(
10587 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10588 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010590 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010591 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10592 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593 Py_DECREF(u);
10594 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010595 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597
Benjamin Peterson29060642009-01-31 22:14:21 +000010598 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599 Py_XDECREF(u);
10600 Py_XDECREF(v);
10601 return NULL;
10602}
10603
Victor Stinnerb0923652011-10-04 01:17:31 +020010604static void
10605unicode_append_inplace(PyObject **p_left, PyObject *right)
10606{
10607 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010608
10609 assert(PyUnicode_IS_READY(*p_left));
10610 assert(PyUnicode_IS_READY(right));
10611
10612 left_len = PyUnicode_GET_LENGTH(*p_left);
10613 right_len = PyUnicode_GET_LENGTH(right);
10614 if (left_len > PY_SSIZE_T_MAX - right_len) {
10615 PyErr_SetString(PyExc_OverflowError,
10616 "strings are too large to concat");
10617 goto error;
10618 }
10619 new_len = left_len + right_len;
10620
10621 /* Now we own the last reference to 'left', so we can resize it
10622 * in-place.
10623 */
10624 if (unicode_resize(p_left, new_len) != 0) {
10625 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10626 * deallocated so it cannot be put back into
10627 * 'variable'. The MemoryError is raised when there
10628 * is no value in 'variable', which might (very
10629 * remotely) be a cause of incompatibilities.
10630 */
10631 goto error;
10632 }
10633 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010634 copy_characters(*p_left, left_len, right, 0, right_len);
10635 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010636 return;
10637
10638error:
10639 Py_DECREF(*p_left);
10640 *p_left = NULL;
10641}
10642
Walter Dörwald1ab83302007-05-18 17:15:44 +000010643void
Victor Stinner23e56682011-10-03 03:54:37 +020010644PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010645{
Victor Stinner23e56682011-10-03 03:54:37 +020010646 PyObject *left, *res;
10647
10648 if (p_left == NULL) {
10649 if (!PyErr_Occurred())
10650 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010651 return;
10652 }
Victor Stinner23e56682011-10-03 03:54:37 +020010653 left = *p_left;
10654 if (right == NULL || !PyUnicode_Check(left)) {
10655 if (!PyErr_Occurred())
10656 PyErr_BadInternalCall();
10657 goto error;
10658 }
10659
Victor Stinnere1335c72011-10-04 20:53:03 +020010660 if (PyUnicode_READY(left))
10661 goto error;
10662 if (PyUnicode_READY(right))
10663 goto error;
10664
Victor Stinner23e56682011-10-03 03:54:37 +020010665 if (PyUnicode_CheckExact(left) && left != unicode_empty
10666 && PyUnicode_CheckExact(right) && right != unicode_empty
10667 && unicode_resizable(left)
10668 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10669 || _PyUnicode_WSTR(left) != NULL))
10670 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010671 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10672 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010673 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010674 not so different than duplicating the string. */
10675 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010676 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010677 unicode_append_inplace(p_left, right);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010678 assert(p_left == NULL || _PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010679 return;
10680 }
10681 }
10682
10683 res = PyUnicode_Concat(left, right);
10684 if (res == NULL)
10685 goto error;
10686 Py_DECREF(left);
10687 *p_left = res;
10688 return;
10689
10690error:
10691 Py_DECREF(*p_left);
10692 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010693}
10694
10695void
10696PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10697{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010698 PyUnicode_Append(pleft, right);
10699 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010700}
10701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010702PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010703 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010705Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010706string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010707interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708
10709static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010710unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010712 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010713 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010714 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 int kind1, kind2, kind;
10717 void *buf1, *buf2;
10718 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719
Jesus Ceaac451502011-04-20 17:09:23 +020010720 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10721 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010722 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010724 kind1 = PyUnicode_KIND(self);
10725 kind2 = PyUnicode_KIND(substring);
10726 kind = kind1 > kind2 ? kind1 : kind2;
10727 buf1 = PyUnicode_DATA(self);
10728 buf2 = PyUnicode_DATA(substring);
10729 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010730 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010731 if (!buf1) {
10732 Py_DECREF(substring);
10733 return NULL;
10734 }
10735 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010736 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 if (!buf2) {
10738 Py_DECREF(substring);
10739 if (kind1 != kind) PyMem_Free(buf1);
10740 return NULL;
10741 }
10742 len1 = PyUnicode_GET_LENGTH(self);
10743 len2 = PyUnicode_GET_LENGTH(substring);
10744
10745 ADJUST_INDICES(start, end, len1);
10746 switch(kind) {
10747 case PyUnicode_1BYTE_KIND:
10748 iresult = ucs1lib_count(
10749 ((Py_UCS1*)buf1) + start, end - start,
10750 buf2, len2, PY_SSIZE_T_MAX
10751 );
10752 break;
10753 case PyUnicode_2BYTE_KIND:
10754 iresult = ucs2lib_count(
10755 ((Py_UCS2*)buf1) + start, end - start,
10756 buf2, len2, PY_SSIZE_T_MAX
10757 );
10758 break;
10759 case PyUnicode_4BYTE_KIND:
10760 iresult = ucs4lib_count(
10761 ((Py_UCS4*)buf1) + start, end - start,
10762 buf2, len2, PY_SSIZE_T_MAX
10763 );
10764 break;
10765 default:
10766 assert(0); iresult = 0;
10767 }
10768
10769 result = PyLong_FromSsize_t(iresult);
10770
10771 if (kind1 != kind)
10772 PyMem_Free(buf1);
10773 if (kind2 != kind)
10774 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775
10776 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010777
Guido van Rossumd57fd912000-03-10 22:53:23 +000010778 return result;
10779}
10780
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010781PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010782 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010784Encode S using the codec registered for encoding. Default encoding\n\
10785is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010786handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010787a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10788'xmlcharrefreplace' as well as any other name registered with\n\
10789codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010790
10791static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010792unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010793{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010794 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010795 char *encoding = NULL;
10796 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010797
Benjamin Peterson308d6372009-09-18 21:42:35 +000010798 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10799 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010800 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010801 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010802}
10803
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010804PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010805 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010806\n\
10807Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010808If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010809
10810static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010811unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010813 Py_ssize_t i, j, line_pos, src_len, incr;
10814 Py_UCS4 ch;
10815 PyObject *u;
10816 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010817 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010818 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010819 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010820
10821 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010822 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823
Antoine Pitrou22425222011-10-04 19:10:51 +020010824 if (PyUnicode_READY(self) == -1)
10825 return NULL;
10826
Thomas Wouters7e474022000-07-16 12:04:32 +000010827 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010828 src_len = PyUnicode_GET_LENGTH(self);
10829 i = j = line_pos = 0;
10830 kind = PyUnicode_KIND(self);
10831 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010832 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010833 for (; i < src_len; i++) {
10834 ch = PyUnicode_READ(kind, src_data, i);
10835 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010836 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010837 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010838 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010839 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010840 goto overflow;
10841 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010842 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010843 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010844 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010845 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010846 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010847 goto overflow;
10848 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010850 if (ch == '\n' || ch == '\r')
10851 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010853 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010854 if (!found && PyUnicode_CheckExact(self)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +010010855 Py_INCREF(self);
10856 return self;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010857 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010858
Guido van Rossumd57fd912000-03-10 22:53:23 +000010859 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010860 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861 if (!u)
10862 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010863 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864
Antoine Pitroue71d5742011-10-04 15:55:09 +020010865 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010866
Antoine Pitroue71d5742011-10-04 15:55:09 +020010867 for (; i < src_len; i++) {
10868 ch = PyUnicode_READ(kind, src_data, i);
10869 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010870 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010871 incr = tabsize - (line_pos % tabsize);
10872 line_pos += incr;
10873 while (incr--) {
10874 PyUnicode_WRITE(kind, dest_data, j, ' ');
10875 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010876 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010877 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010878 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010879 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010880 line_pos++;
10881 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010882 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010883 if (ch == '\n' || ch == '\r')
10884 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010885 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010886 }
10887 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010888 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010889
Antoine Pitroue71d5742011-10-04 15:55:09 +020010890 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010891 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10892 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893}
10894
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010895PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010896 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010897\n\
10898Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010899such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010900arguments start and end are interpreted as in slice notation.\n\
10901\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010902Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903
10904static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010905unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010906{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010907 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010908 Py_ssize_t start;
10909 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010910 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911
Jesus Ceaac451502011-04-20 17:09:23 +020010912 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10913 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010914 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916 if (PyUnicode_READY(self) == -1)
10917 return NULL;
10918 if (PyUnicode_READY(substring) == -1)
10919 return NULL;
10920
Victor Stinner7931d9a2011-11-04 00:22:48 +010010921 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922
10923 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010925 if (result == -2)
10926 return NULL;
10927
Christian Heimes217cfd12007-12-02 14:31:20 +000010928 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929}
10930
10931static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010932unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010934 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10935 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010938}
10939
Guido van Rossumc2504932007-09-18 19:42:40 +000010940/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010941 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010942static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010943unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944{
Guido van Rossumc2504932007-09-18 19:42:40 +000010945 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010946 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010948 if (_PyUnicode_HASH(self) != -1)
10949 return _PyUnicode_HASH(self);
10950 if (PyUnicode_READY(self) == -1)
10951 return -1;
10952 len = PyUnicode_GET_LENGTH(self);
10953
10954 /* The hash function as a macro, gets expanded three times below. */
10955#define HASH(P) \
10956 x = (Py_uhash_t)*P << 7; \
10957 while (--len >= 0) \
10958 x = (1000003*x) ^ (Py_uhash_t)*P++;
10959
10960 switch (PyUnicode_KIND(self)) {
10961 case PyUnicode_1BYTE_KIND: {
10962 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10963 HASH(c);
10964 break;
10965 }
10966 case PyUnicode_2BYTE_KIND: {
10967 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10968 HASH(s);
10969 break;
10970 }
10971 default: {
10972 Py_UCS4 *l;
10973 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10974 "Impossible switch case in unicode_hash");
10975 l = PyUnicode_4BYTE_DATA(self);
10976 HASH(l);
10977 break;
10978 }
10979 }
10980 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10981
Guido van Rossumc2504932007-09-18 19:42:40 +000010982 if (x == -1)
10983 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010984 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010985 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010987#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010989PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010990 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010992Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993
10994static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010997 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010998 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010999 Py_ssize_t start;
11000 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001
Jesus Ceaac451502011-04-20 17:09:23 +020011002 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11003 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011006 if (PyUnicode_READY(self) == -1)
11007 return NULL;
11008 if (PyUnicode_READY(substring) == -1)
11009 return NULL;
11010
Victor Stinner7931d9a2011-11-04 00:22:48 +010011011 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012
11013 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011015 if (result == -2)
11016 return NULL;
11017
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018 if (result < 0) {
11019 PyErr_SetString(PyExc_ValueError, "substring not found");
11020 return NULL;
11021 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011022
Christian Heimes217cfd12007-12-02 14:31:20 +000011023 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024}
11025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011026PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011027 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011029Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011030at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031
11032static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011033unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011035 Py_ssize_t i, length;
11036 int kind;
11037 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011038 int cased;
11039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011040 if (PyUnicode_READY(self) == -1)
11041 return NULL;
11042 length = PyUnicode_GET_LENGTH(self);
11043 kind = PyUnicode_KIND(self);
11044 data = PyUnicode_DATA(self);
11045
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047 if (length == 1)
11048 return PyBool_FromLong(
11049 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011051 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011052 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011053 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011054
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011056 for (i = 0; i < length; i++) {
11057 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011058
Benjamin Peterson29060642009-01-31 22:14:21 +000011059 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11060 return PyBool_FromLong(0);
11061 else if (!cased && Py_UNICODE_ISLOWER(ch))
11062 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011064 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065}
11066
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011067PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011068 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011070Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011071at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072
11073static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011074unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011076 Py_ssize_t i, length;
11077 int kind;
11078 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011079 int cased;
11080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011081 if (PyUnicode_READY(self) == -1)
11082 return NULL;
11083 length = PyUnicode_GET_LENGTH(self);
11084 kind = PyUnicode_KIND(self);
11085 data = PyUnicode_DATA(self);
11086
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011088 if (length == 1)
11089 return PyBool_FromLong(
11090 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011091
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011092 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011094 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011095
Guido van Rossumd57fd912000-03-10 22:53:23 +000011096 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011097 for (i = 0; i < length; i++) {
11098 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011099
Benjamin Peterson29060642009-01-31 22:14:21 +000011100 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11101 return PyBool_FromLong(0);
11102 else if (!cased && Py_UNICODE_ISUPPER(ch))
11103 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011104 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011105 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011106}
11107
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011108PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011109 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011111Return True if S is a titlecased string and there is at least one\n\
11112character in S, i.e. upper- and titlecase characters may only\n\
11113follow uncased characters and lowercase characters only cased ones.\n\
11114Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011115
11116static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011117unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011119 Py_ssize_t i, length;
11120 int kind;
11121 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122 int cased, previous_is_cased;
11123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011124 if (PyUnicode_READY(self) == -1)
11125 return NULL;
11126 length = PyUnicode_GET_LENGTH(self);
11127 kind = PyUnicode_KIND(self);
11128 data = PyUnicode_DATA(self);
11129
Guido van Rossumd57fd912000-03-10 22:53:23 +000011130 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011131 if (length == 1) {
11132 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11133 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11134 (Py_UNICODE_ISUPPER(ch) != 0));
11135 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011136
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011137 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011138 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011139 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011140
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141 cased = 0;
11142 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011143 for (i = 0; i < length; i++) {
11144 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011145
Benjamin Peterson29060642009-01-31 22:14:21 +000011146 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11147 if (previous_is_cased)
11148 return PyBool_FromLong(0);
11149 previous_is_cased = 1;
11150 cased = 1;
11151 }
11152 else if (Py_UNICODE_ISLOWER(ch)) {
11153 if (!previous_is_cased)
11154 return PyBool_FromLong(0);
11155 previous_is_cased = 1;
11156 cased = 1;
11157 }
11158 else
11159 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011160 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011161 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162}
11163
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011164PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011165 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011166\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011167Return True if all characters in S are whitespace\n\
11168and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169
11170static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011171unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173 Py_ssize_t i, length;
11174 int kind;
11175 void *data;
11176
11177 if (PyUnicode_READY(self) == -1)
11178 return NULL;
11179 length = PyUnicode_GET_LENGTH(self);
11180 kind = PyUnicode_KIND(self);
11181 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011184 if (length == 1)
11185 return PyBool_FromLong(
11186 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011187
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011188 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011189 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011190 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011191
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011192 for (i = 0; i < length; i++) {
11193 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011194 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011195 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011197 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011198}
11199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011200PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011201 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011202\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011203Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011204and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011205
11206static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011207unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011208{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011209 Py_ssize_t i, length;
11210 int kind;
11211 void *data;
11212
11213 if (PyUnicode_READY(self) == -1)
11214 return NULL;
11215 length = PyUnicode_GET_LENGTH(self);
11216 kind = PyUnicode_KIND(self);
11217 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011218
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011219 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011220 if (length == 1)
11221 return PyBool_FromLong(
11222 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011223
11224 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011225 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011226 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011227
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011228 for (i = 0; i < length; i++) {
11229 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011230 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011231 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011232 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011233}
11234
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011235PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011236 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011237\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011238Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011239and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011240
11241static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011242unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011243{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011244 int kind;
11245 void *data;
11246 Py_ssize_t len, i;
11247
11248 if (PyUnicode_READY(self) == -1)
11249 return NULL;
11250
11251 kind = PyUnicode_KIND(self);
11252 data = PyUnicode_DATA(self);
11253 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011254
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011255 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011256 if (len == 1) {
11257 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11258 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11259 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011260
11261 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011262 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011263 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265 for (i = 0; i < len; i++) {
11266 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011267 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011268 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011269 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011270 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011271}
11272
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011273PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011274 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011276Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011277False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278
11279static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011280unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011282 Py_ssize_t i, length;
11283 int kind;
11284 void *data;
11285
11286 if (PyUnicode_READY(self) == -1)
11287 return NULL;
11288 length = PyUnicode_GET_LENGTH(self);
11289 kind = PyUnicode_KIND(self);
11290 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011293 if (length == 1)
11294 return PyBool_FromLong(
11295 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011296
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011297 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011298 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011299 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011300
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011301 for (i = 0; i < length; i++) {
11302 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011303 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011304 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011305 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306}
11307
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011308PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011309 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011310\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011311Return True if all characters in S are digits\n\
11312and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313
11314static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011315unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011317 Py_ssize_t i, length;
11318 int kind;
11319 void *data;
11320
11321 if (PyUnicode_READY(self) == -1)
11322 return NULL;
11323 length = PyUnicode_GET_LENGTH(self);
11324 kind = PyUnicode_KIND(self);
11325 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011328 if (length == 1) {
11329 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11330 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11331 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011333 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011335 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011336
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 for (i = 0; i < length; i++) {
11338 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011339 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011340 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011341 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342}
11343
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011344PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011345 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011347Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011348False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349
11350static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011351unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 Py_ssize_t i, length;
11354 int kind;
11355 void *data;
11356
11357 if (PyUnicode_READY(self) == -1)
11358 return NULL;
11359 length = PyUnicode_GET_LENGTH(self);
11360 kind = PyUnicode_KIND(self);
11361 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362
Guido van Rossumd57fd912000-03-10 22:53:23 +000011363 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011364 if (length == 1)
11365 return PyBool_FromLong(
11366 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011368 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011370 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011372 for (i = 0; i < length; i++) {
11373 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011374 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011376 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011377}
11378
Martin v. Löwis47383402007-08-15 07:32:56 +000011379int
11380PyUnicode_IsIdentifier(PyObject *self)
11381{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382 int kind;
11383 void *data;
11384 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011385 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011387 if (PyUnicode_READY(self) == -1) {
11388 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011389 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011390 }
11391
11392 /* Special case for empty strings */
11393 if (PyUnicode_GET_LENGTH(self) == 0)
11394 return 0;
11395 kind = PyUnicode_KIND(self);
11396 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011397
11398 /* PEP 3131 says that the first character must be in
11399 XID_Start and subsequent characters in XID_Continue,
11400 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011401 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011402 letters, digits, underscore). However, given the current
11403 definition of XID_Start and XID_Continue, it is sufficient
11404 to check just for these, except that _ must be allowed
11405 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011406 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011407 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011408 return 0;
11409
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011410 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011412 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011413 return 1;
11414}
11415
11416PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011417 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011418\n\
11419Return True if S is a valid identifier according\n\
11420to the language definition.");
11421
11422static PyObject*
11423unicode_isidentifier(PyObject *self)
11424{
11425 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11426}
11427
Georg Brandl559e5d72008-06-11 18:37:52 +000011428PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011429 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011430\n\
11431Return True if all characters in S are considered\n\
11432printable in repr() or S is empty, False otherwise.");
11433
11434static PyObject*
11435unicode_isprintable(PyObject *self)
11436{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011437 Py_ssize_t i, length;
11438 int kind;
11439 void *data;
11440
11441 if (PyUnicode_READY(self) == -1)
11442 return NULL;
11443 length = PyUnicode_GET_LENGTH(self);
11444 kind = PyUnicode_KIND(self);
11445 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011446
11447 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 if (length == 1)
11449 return PyBool_FromLong(
11450 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 for (i = 0; i < length; i++) {
11453 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011454 Py_RETURN_FALSE;
11455 }
11456 }
11457 Py_RETURN_TRUE;
11458}
11459
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011460PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011461 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462\n\
11463Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011464iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465
11466static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011467unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011469 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470}
11471
Martin v. Löwis18e16552006-02-15 17:27:45 +000011472static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011473unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011475 if (PyUnicode_READY(self) == -1)
11476 return -1;
11477 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478}
11479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011480PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011481 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011483Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011484done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485
11486static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011487unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011489 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011490 Py_UCS4 fillchar = ' ';
11491
11492 if (PyUnicode_READY(self) == -1)
11493 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011494
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011495 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496 return NULL;
11497
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011498 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011500 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501 }
11502
Victor Stinner7931d9a2011-11-04 00:22:48 +010011503 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504}
11505
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011506PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011507 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011509Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510
11511static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011512unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514 return fixup(self, fixlower);
11515}
11516
/* Selects which end(s) of the string a strip operation trims */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() formats, indexed by the constants above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip type: the format minus its leading "|O:" */
#define STRIPNAME(i) (stripformat[i]+3)
11525
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011526/* externally visible for str.strip(unicode) */
11527PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011528_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011529{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011530 void *data;
11531 int kind;
11532 Py_ssize_t i, j, len;
11533 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11536 return NULL;
11537
11538 kind = PyUnicode_KIND(self);
11539 data = PyUnicode_DATA(self);
11540 len = PyUnicode_GET_LENGTH(self);
11541 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11542 PyUnicode_DATA(sepobj),
11543 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011544
Benjamin Peterson14339b62009-01-31 16:36:08 +000011545 i = 0;
11546 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 while (i < len &&
11548 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011549 i++;
11550 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011551 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011552
Benjamin Peterson14339b62009-01-31 16:36:08 +000011553 j = len;
11554 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 do {
11556 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011557 } while (j >= i &&
11558 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011559 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011560 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011561
Victor Stinner7931d9a2011-11-04 00:22:48 +010011562 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011563}
11564
11565PyObject*
11566PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11567{
11568 unsigned char *data;
11569 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011570 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571
Victor Stinnerde636f32011-10-01 03:55:54 +020011572 if (PyUnicode_READY(self) == -1)
11573 return NULL;
11574
11575 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11576
Victor Stinner12bab6d2011-10-01 01:53:49 +020011577 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011578 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011579 if (PyUnicode_CheckExact(self)) {
11580 Py_INCREF(self);
11581 return self;
11582 }
11583 else
11584 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011585 }
11586
Victor Stinner12bab6d2011-10-01 01:53:49 +020011587 length = end - start;
11588 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011589 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011590
Victor Stinnerde636f32011-10-01 03:55:54 +020011591 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011592 PyErr_SetString(PyExc_IndexError, "string index out of range");
11593 return NULL;
11594 }
11595
Victor Stinnerb9275c12011-10-05 14:01:42 +020011596 if (PyUnicode_IS_ASCII(self)) {
11597 kind = PyUnicode_KIND(self);
11598 data = PyUnicode_1BYTE_DATA(self);
11599 return unicode_fromascii(data + start, length);
11600 }
11601 else {
11602 kind = PyUnicode_KIND(self);
11603 data = PyUnicode_1BYTE_DATA(self);
11604 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011605 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011606 length);
11607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011608}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609
11610static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011611do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 int kind;
11614 void *data;
11615 Py_ssize_t len, i, j;
11616
11617 if (PyUnicode_READY(self) == -1)
11618 return NULL;
11619
11620 kind = PyUnicode_KIND(self);
11621 data = PyUnicode_DATA(self);
11622 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011623
Benjamin Peterson14339b62009-01-31 16:36:08 +000011624 i = 0;
11625 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011627 i++;
11628 }
11629 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011630
Benjamin Peterson14339b62009-01-31 16:36:08 +000011631 j = len;
11632 if (striptype != LEFTSTRIP) {
11633 do {
11634 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011636 j++;
11637 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011638
Victor Stinner7931d9a2011-11-04 00:22:48 +010011639 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640}
11641
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011642
11643static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011644do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011645{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011646 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011647
Benjamin Peterson14339b62009-01-31 16:36:08 +000011648 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11649 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011650
Benjamin Peterson14339b62009-01-31 16:36:08 +000011651 if (sep != NULL && sep != Py_None) {
11652 if (PyUnicode_Check(sep))
11653 return _PyUnicode_XStrip(self, striptype, sep);
11654 else {
11655 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011656 "%s arg must be None or str",
11657 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011658 return NULL;
11659 }
11660 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011661
Benjamin Peterson14339b62009-01-31 16:36:08 +000011662 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011663}
11664
11665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011666PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011667 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011668\n\
11669Return a copy of the string S with leading and trailing\n\
11670whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011671If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011672
11673static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011674unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011675{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011676 if (PyTuple_GET_SIZE(args) == 0)
11677 return do_strip(self, BOTHSTRIP); /* Common case */
11678 else
11679 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011680}
11681
11682
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011683PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011684 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011685\n\
11686Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011687If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011688
11689static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011690unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011691{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011692 if (PyTuple_GET_SIZE(args) == 0)
11693 return do_strip(self, LEFTSTRIP); /* Common case */
11694 else
11695 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011696}
11697
11698
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011699PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011700 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011701\n\
11702Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011703If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011704
11705static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011706unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011707{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011708 if (PyTuple_GET_SIZE(args) == 0)
11709 return do_strip(self, RIGHTSTRIP); /* Common case */
11710 else
11711 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011712}
11713
11714
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011716unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011718 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720
Georg Brandl222de0f2009-04-12 12:01:50 +000011721 if (len < 1) {
11722 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011723 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011724 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725
Tim Peters7a29bd52001-09-12 03:03:31 +000011726 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727 /* no repeat, return original string */
11728 Py_INCREF(str);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011729 return str;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011730 }
Tim Peters8f422462000-09-09 06:13:41 +000011731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011732 if (PyUnicode_READY(str) == -1)
11733 return NULL;
11734
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011735 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011736 PyErr_SetString(PyExc_OverflowError,
11737 "repeated string is too long");
11738 return NULL;
11739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011740 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011741
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011742 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743 if (!u)
11744 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011745 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747 if (PyUnicode_GET_LENGTH(str) == 1) {
11748 const int kind = PyUnicode_KIND(str);
11749 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11750 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011751 if (kind == PyUnicode_1BYTE_KIND)
11752 memset(to, (unsigned char)fill_char, len);
11753 else {
11754 for (n = 0; n < len; ++n)
11755 PyUnicode_WRITE(kind, to, n, fill_char);
11756 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757 }
11758 else {
11759 /* number of characters copied this far */
11760 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011761 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011762 char *to = (char *) PyUnicode_DATA(u);
11763 Py_MEMCPY(to, PyUnicode_DATA(str),
11764 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011765 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766 n = (done <= nchars-done) ? done : nchars-done;
11767 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011768 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011769 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770 }
11771
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011772 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011773 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774}
11775
Alexander Belopolsky40018472011-02-26 01:02:56 +000011776PyObject *
11777PyUnicode_Replace(PyObject *obj,
11778 PyObject *subobj,
11779 PyObject *replobj,
11780 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781{
11782 PyObject *self;
11783 PyObject *str1;
11784 PyObject *str2;
11785 PyObject *result;
11786
11787 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011788 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011789 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011791 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011792 Py_DECREF(self);
11793 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794 }
11795 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011796 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011797 Py_DECREF(self);
11798 Py_DECREF(str1);
11799 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011801 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802 Py_DECREF(self);
11803 Py_DECREF(str1);
11804 Py_DECREF(str2);
11805 return result;
11806}
11807
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011808PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011809 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810\n\
11811Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011812old replaced by new. If the optional argument count is\n\
11813given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814
11815static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 PyObject *str1;
11819 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011820 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821 PyObject *result;
11822
Martin v. Löwis18e16552006-02-15 17:27:45 +000011823 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011825 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011826 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011827 str1 = PyUnicode_FromObject(str1);
11828 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11829 return NULL;
11830 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011831 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011832 Py_DECREF(str1);
11833 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011834 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835
11836 result = replace(self, str1, str2, maxcount);
11837
11838 Py_DECREF(str1);
11839 Py_DECREF(str2);
11840 return result;
11841}
11842
Alexander Belopolsky40018472011-02-26 01:02:56 +000011843static PyObject *
11844unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011846 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847 Py_ssize_t isize;
11848 Py_ssize_t osize, squote, dquote, i, o;
11849 Py_UCS4 max, quote;
11850 int ikind, okind;
11851 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011852
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011853 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011854 return NULL;
11855
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011856 isize = PyUnicode_GET_LENGTH(unicode);
11857 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 /* Compute length of output, quote characters, and
11860 maximum character */
11861 osize = 2; /* quotes */
11862 max = 127;
11863 squote = dquote = 0;
11864 ikind = PyUnicode_KIND(unicode);
11865 for (i = 0; i < isize; i++) {
11866 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11867 switch (ch) {
11868 case '\'': squote++; osize++; break;
11869 case '"': dquote++; osize++; break;
11870 case '\\': case '\t': case '\r': case '\n':
11871 osize += 2; break;
11872 default:
11873 /* Fast-path ASCII */
11874 if (ch < ' ' || ch == 0x7f)
11875 osize += 4; /* \xHH */
11876 else if (ch < 0x7f)
11877 osize++;
11878 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11879 osize++;
11880 max = ch > max ? ch : max;
11881 }
11882 else if (ch < 0x100)
11883 osize += 4; /* \xHH */
11884 else if (ch < 0x10000)
11885 osize += 6; /* \uHHHH */
11886 else
11887 osize += 10; /* \uHHHHHHHH */
11888 }
11889 }
11890
11891 quote = '\'';
11892 if (squote) {
11893 if (dquote)
11894 /* Both squote and dquote present. Use squote,
11895 and escape them */
11896 osize += squote;
11897 else
11898 quote = '"';
11899 }
11900
11901 repr = PyUnicode_New(osize, max);
11902 if (repr == NULL)
11903 return NULL;
11904 okind = PyUnicode_KIND(repr);
11905 odata = PyUnicode_DATA(repr);
11906
11907 PyUnicode_WRITE(okind, odata, 0, quote);
11908 PyUnicode_WRITE(okind, odata, osize-1, quote);
11909
11910 for (i = 0, o = 1; i < isize; i++) {
11911 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011912
11913 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 if ((ch == quote) || (ch == '\\')) {
11915 PyUnicode_WRITE(okind, odata, o++, '\\');
11916 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011917 continue;
11918 }
11919
Benjamin Peterson29060642009-01-31 22:14:21 +000011920 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011921 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 PyUnicode_WRITE(okind, odata, o++, '\\');
11923 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011924 }
11925 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 PyUnicode_WRITE(okind, odata, o++, '\\');
11927 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011928 }
11929 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 PyUnicode_WRITE(okind, odata, o++, '\\');
11931 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011932 }
11933
11934 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011935 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011936 PyUnicode_WRITE(okind, odata, o++, '\\');
11937 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011938 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11939 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011940 }
11941
Georg Brandl559e5d72008-06-11 18:37:52 +000011942 /* Copy ASCII characters as-is */
11943 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011945 }
11946
Benjamin Peterson29060642009-01-31 22:14:21 +000011947 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011948 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011949 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011950 (categories Z* and C* except ASCII space)
11951 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011953 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011954 if (ch <= 0xff) {
11955 PyUnicode_WRITE(okind, odata, o++, '\\');
11956 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011957 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11958 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011959 }
11960 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011961 else if (ch >= 0x10000) {
11962 PyUnicode_WRITE(okind, odata, o++, '\\');
11963 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011964 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11965 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11966 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11967 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
11968 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11969 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11970 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11971 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011972 }
11973 /* Map 16-bit characters to '\uxxxx' */
11974 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 PyUnicode_WRITE(okind, odata, o++, '\\');
11976 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011977 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11978 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11979 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11980 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011981 }
11982 }
11983 /* Copy characters as-is */
11984 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011985 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011986 }
11987 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011988 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011989 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011990 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011991 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992}
11993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011994PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011995 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011996\n\
11997Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011998such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999arguments start and end are interpreted as in slice notation.\n\
12000\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012001Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002
12003static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012005{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012006 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012007 Py_ssize_t start;
12008 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012009 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012010
Jesus Ceaac451502011-04-20 17:09:23 +020012011 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12012 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012013 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015 if (PyUnicode_READY(self) == -1)
12016 return NULL;
12017 if (PyUnicode_READY(substring) == -1)
12018 return NULL;
12019
Victor Stinner7931d9a2011-11-04 00:22:48 +010012020 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012021
12022 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 if (result == -2)
12025 return NULL;
12026
Christian Heimes217cfd12007-12-02 14:31:20 +000012027 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028}
12029
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012030PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012031 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012033Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034
12035static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012037{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012038 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012039 Py_ssize_t start;
12040 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012041 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042
Jesus Ceaac451502011-04-20 17:09:23 +020012043 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12044 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012045 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012047 if (PyUnicode_READY(self) == -1)
12048 return NULL;
12049 if (PyUnicode_READY(substring) == -1)
12050 return NULL;
12051
Victor Stinner7931d9a2011-11-04 00:22:48 +010012052 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053
12054 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 if (result == -2)
12057 return NULL;
12058
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059 if (result < 0) {
12060 PyErr_SetString(PyExc_ValueError, "substring not found");
12061 return NULL;
12062 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063
Christian Heimes217cfd12007-12-02 14:31:20 +000012064 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065}
12066
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012067PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012068 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012069\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012070Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012071done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012072
12073static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012074unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012076 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012077 Py_UCS4 fillchar = ' ';
12078
Victor Stinnere9a29352011-10-01 02:14:59 +020012079 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012080 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012081
Victor Stinnere9a29352011-10-01 02:14:59 +020012082 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083 return NULL;
12084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012085 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012087 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088 }
12089
Victor Stinner7931d9a2011-11-04 00:22:48 +010012090 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091}
12092
Alexander Belopolsky40018472011-02-26 01:02:56 +000012093PyObject *
12094PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095{
12096 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012097
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098 s = PyUnicode_FromObject(s);
12099 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012100 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012101 if (sep != NULL) {
12102 sep = PyUnicode_FromObject(sep);
12103 if (sep == NULL) {
12104 Py_DECREF(s);
12105 return NULL;
12106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107 }
12108
Victor Stinner9310abb2011-10-05 00:59:23 +020012109 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110
12111 Py_DECREF(s);
12112 Py_XDECREF(sep);
12113 return result;
12114}
12115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012116PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012117 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118\n\
12119Return a list of the words in S, using sep as the\n\
12120delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012121splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012122whitespace string is a separator and empty strings are\n\
12123removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124
12125static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012126unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127{
12128 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012129 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012130
Martin v. Löwis18e16552006-02-15 17:27:45 +000012131 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012132 return NULL;
12133
12134 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012135 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012136 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012137 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012138 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012139 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012140}
12141
Thomas Wouters477c8d52006-05-27 19:21:47 +000012142PyObject *
12143PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12144{
12145 PyObject* str_obj;
12146 PyObject* sep_obj;
12147 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012148 int kind1, kind2, kind;
12149 void *buf1 = NULL, *buf2 = NULL;
12150 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012151
12152 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012153 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012154 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012155 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012156 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012157 Py_DECREF(str_obj);
12158 return NULL;
12159 }
12160
Victor Stinner14f8f022011-10-05 20:58:25 +020012161 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012162 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012163 kind = Py_MAX(kind1, kind2);
12164 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012166 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012167 if (!buf1)
12168 goto onError;
12169 buf2 = PyUnicode_DATA(sep_obj);
12170 if (kind2 != kind)
12171 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12172 if (!buf2)
12173 goto onError;
12174 len1 = PyUnicode_GET_LENGTH(str_obj);
12175 len2 = PyUnicode_GET_LENGTH(sep_obj);
12176
Victor Stinner14f8f022011-10-05 20:58:25 +020012177 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012178 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012179 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12180 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12181 else
12182 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 break;
12184 case PyUnicode_2BYTE_KIND:
12185 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12186 break;
12187 case PyUnicode_4BYTE_KIND:
12188 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12189 break;
12190 default:
12191 assert(0);
12192 out = 0;
12193 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012194
12195 Py_DECREF(sep_obj);
12196 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 if (kind1 != kind)
12198 PyMem_Free(buf1);
12199 if (kind2 != kind)
12200 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012201
12202 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012203 onError:
12204 Py_DECREF(sep_obj);
12205 Py_DECREF(str_obj);
12206 if (kind1 != kind && buf1)
12207 PyMem_Free(buf1);
12208 if (kind2 != kind && buf2)
12209 PyMem_Free(buf2);
12210 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012211}
12212
12213
12214PyObject *
12215PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12216{
12217 PyObject* str_obj;
12218 PyObject* sep_obj;
12219 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 int kind1, kind2, kind;
12221 void *buf1 = NULL, *buf2 = NULL;
12222 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012223
12224 str_obj = PyUnicode_FromObject(str_in);
12225 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012226 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012227 sep_obj = PyUnicode_FromObject(sep_in);
12228 if (!sep_obj) {
12229 Py_DECREF(str_obj);
12230 return NULL;
12231 }
12232
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012233 kind1 = PyUnicode_KIND(str_in);
12234 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012235 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012236 buf1 = PyUnicode_DATA(str_in);
12237 if (kind1 != kind)
12238 buf1 = _PyUnicode_AsKind(str_in, kind);
12239 if (!buf1)
12240 goto onError;
12241 buf2 = PyUnicode_DATA(sep_obj);
12242 if (kind2 != kind)
12243 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12244 if (!buf2)
12245 goto onError;
12246 len1 = PyUnicode_GET_LENGTH(str_obj);
12247 len2 = PyUnicode_GET_LENGTH(sep_obj);
12248
12249 switch(PyUnicode_KIND(str_in)) {
12250 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012251 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12252 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12253 else
12254 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012255 break;
12256 case PyUnicode_2BYTE_KIND:
12257 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12258 break;
12259 case PyUnicode_4BYTE_KIND:
12260 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12261 break;
12262 default:
12263 assert(0);
12264 out = 0;
12265 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012266
12267 Py_DECREF(sep_obj);
12268 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012269 if (kind1 != kind)
12270 PyMem_Free(buf1);
12271 if (kind2 != kind)
12272 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012273
12274 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012275 onError:
12276 Py_DECREF(sep_obj);
12277 Py_DECREF(str_obj);
12278 if (kind1 != kind && buf1)
12279 PyMem_Free(buf1);
12280 if (kind2 != kind && buf2)
12281 PyMem_Free(buf2);
12282 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012283}
12284
12285PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012286 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012287\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012288Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012289the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012290found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012291
12292static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012293unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012294{
Victor Stinner9310abb2011-10-05 00:59:23 +020012295 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012296}
12297
12298PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012299 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012300\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012301Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012302the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012303separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012304
12305static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012306unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012307{
Victor Stinner9310abb2011-10-05 00:59:23 +020012308 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012309}
12310
Alexander Belopolsky40018472011-02-26 01:02:56 +000012311PyObject *
12312PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012313{
12314 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012315
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012316 s = PyUnicode_FromObject(s);
12317 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012318 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012319 if (sep != NULL) {
12320 sep = PyUnicode_FromObject(sep);
12321 if (sep == NULL) {
12322 Py_DECREF(s);
12323 return NULL;
12324 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012325 }
12326
Victor Stinner9310abb2011-10-05 00:59:23 +020012327 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012328
12329 Py_DECREF(s);
12330 Py_XDECREF(sep);
12331 return result;
12332}
12333
12334PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012335 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012336\n\
12337Return a list of the words in S, using sep as the\n\
12338delimiter string, starting at the end of the string and\n\
12339working to the front. If maxsplit is given, at most maxsplit\n\
12340splits are done. If sep is not specified, any whitespace string\n\
12341is a separator.");
12342
12343static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012344unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012345{
12346 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012347 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012348
Martin v. Löwis18e16552006-02-15 17:27:45 +000012349 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012350 return NULL;
12351
12352 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012353 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012354 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012355 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012356 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012357 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012358}
12359
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012360PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012361 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362\n\
12363Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012364Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012365is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366
12367static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012368unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012369{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012370 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012371 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012372
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012373 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12374 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012375 return NULL;
12376
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012377 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012378}
12379
12380static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012381PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382{
Walter Dörwald346737f2007-05-31 10:44:43 +000012383 if (PyUnicode_CheckExact(self)) {
12384 Py_INCREF(self);
12385 return self;
12386 } else
12387 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012388 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012389}
12390
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012391PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012392 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012393\n\
12394Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012395and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012396
12397static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012398unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012399{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012400 return fixup(self, fixswapcase);
12401}
12402
Georg Brandlceee0772007-11-27 23:48:05 +000012403PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012404 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012405\n\
12406Return a translation table usable for str.translate().\n\
12407If there is only one argument, it must be a dictionary mapping Unicode\n\
12408ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012409Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012410If there are two arguments, they must be strings of equal length, and\n\
12411in the resulting dictionary, each character in x will be mapped to the\n\
12412character at the same position in y. If there is a third argument, it\n\
12413must be a string, whose characters will be mapped to None in the result.");
12414
12415static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012416unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012417{
12418 PyObject *x, *y = NULL, *z = NULL;
12419 PyObject *new = NULL, *key, *value;
12420 Py_ssize_t i = 0;
12421 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012422
Georg Brandlceee0772007-11-27 23:48:05 +000012423 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12424 return NULL;
12425 new = PyDict_New();
12426 if (!new)
12427 return NULL;
12428 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012429 int x_kind, y_kind, z_kind;
12430 void *x_data, *y_data, *z_data;
12431
Georg Brandlceee0772007-11-27 23:48:05 +000012432 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012433 if (!PyUnicode_Check(x)) {
12434 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12435 "be a string if there is a second argument");
12436 goto err;
12437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012438 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012439 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12440 "arguments must have equal length");
12441 goto err;
12442 }
12443 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444 x_kind = PyUnicode_KIND(x);
12445 y_kind = PyUnicode_KIND(y);
12446 x_data = PyUnicode_DATA(x);
12447 y_data = PyUnicode_DATA(y);
12448 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12449 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12450 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012451 if (!key || !value)
12452 goto err;
12453 res = PyDict_SetItem(new, key, value);
12454 Py_DECREF(key);
12455 Py_DECREF(value);
12456 if (res < 0)
12457 goto err;
12458 }
12459 /* create entries for deleting chars in z */
12460 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012461 z_kind = PyUnicode_KIND(z);
12462 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012463 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012464 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012465 if (!key)
12466 goto err;
12467 res = PyDict_SetItem(new, key, Py_None);
12468 Py_DECREF(key);
12469 if (res < 0)
12470 goto err;
12471 }
12472 }
12473 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012474 int kind;
12475 void *data;
12476
Georg Brandlceee0772007-11-27 23:48:05 +000012477 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012478 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012479 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12480 "to maketrans it must be a dict");
12481 goto err;
12482 }
12483 /* copy entries into the new dict, converting string keys to int keys */
12484 while (PyDict_Next(x, &i, &key, &value)) {
12485 if (PyUnicode_Check(key)) {
12486 /* convert string keys to integer keys */
12487 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012488 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012489 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12490 "table must be of length 1");
12491 goto err;
12492 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012493 kind = PyUnicode_KIND(key);
12494 data = PyUnicode_DATA(key);
12495 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012496 if (!newkey)
12497 goto err;
12498 res = PyDict_SetItem(new, newkey, value);
12499 Py_DECREF(newkey);
12500 if (res < 0)
12501 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012502 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012503 /* just keep integer keys */
12504 if (PyDict_SetItem(new, key, value) < 0)
12505 goto err;
12506 } else {
12507 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12508 "be strings or integers");
12509 goto err;
12510 }
12511 }
12512 }
12513 return new;
12514 err:
12515 Py_DECREF(new);
12516 return NULL;
12517}
12518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012519PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012520 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521\n\
12522Return a copy of the string S, where all characters have been mapped\n\
12523through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012524Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012525Unmapped characters are left untouched. Characters mapped to None\n\
12526are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527
12528static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012531 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532}
12533
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012534PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012535 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012537Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012538
12539static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012540unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012541{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542 return fixup(self, fixupper);
12543}
12544
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012545PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012546 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012548Pad a numeric string S with zeros on the left, to fill a field\n\
12549of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550
12551static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012552unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012554 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012555 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012556 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 int kind;
12558 void *data;
12559 Py_UCS4 chr;
12560
12561 if (PyUnicode_READY(self) == -1)
12562 return NULL;
12563
Martin v. Löwis18e16552006-02-15 17:27:45 +000012564 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012565 return NULL;
12566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012567 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012568 if (PyUnicode_CheckExact(self)) {
12569 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012570 return self;
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012571 }
12572 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012573 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012574 }
12575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012576 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577
12578 u = pad(self, fill, 0, '0');
12579
Walter Dörwald068325e2002-04-15 13:36:47 +000012580 if (u == NULL)
12581 return NULL;
12582
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012583 kind = PyUnicode_KIND(u);
12584 data = PyUnicode_DATA(u);
12585 chr = PyUnicode_READ(kind, data, fill);
12586
12587 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012588 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012589 PyUnicode_WRITE(kind, data, 0, chr);
12590 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012591 }
12592
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012593 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012594 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012595}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12604
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012605PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012606 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012607\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012608Return True if S starts with the specified prefix, False otherwise.\n\
12609With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012610With optional end, stop comparing S at that position.\n\
12611prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012612
12613static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012614unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012615 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012616{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012617 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012618 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012619 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012620 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012621 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012622
Jesus Ceaac451502011-04-20 17:09:23 +020012623 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012624 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012625 if (PyTuple_Check(subobj)) {
12626 Py_ssize_t i;
12627 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012628 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012629 if (substring == NULL)
12630 return NULL;
12631 result = tailmatch(self, substring, start, end, -1);
12632 Py_DECREF(substring);
12633 if (result) {
12634 Py_RETURN_TRUE;
12635 }
12636 }
12637 /* nothing matched */
12638 Py_RETURN_FALSE;
12639 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012640 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012641 if (substring == NULL) {
12642 if (PyErr_ExceptionMatches(PyExc_TypeError))
12643 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12644 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012645 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012646 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012647 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012648 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012649 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012650}
12651
12652
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012653PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012654 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012656Return True if S ends with the specified suffix, False otherwise.\n\
12657With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012658With optional end, stop comparing S at that position.\n\
12659suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660
12661static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012662unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012663 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012665 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012666 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012667 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012668 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012669 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670
Jesus Ceaac451502011-04-20 17:09:23 +020012671 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012672 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012673 if (PyTuple_Check(subobj)) {
12674 Py_ssize_t i;
12675 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012676 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012677 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012678 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012679 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012680 result = tailmatch(self, substring, start, end, +1);
12681 Py_DECREF(substring);
12682 if (result) {
12683 Py_RETURN_TRUE;
12684 }
12685 }
12686 Py_RETURN_FALSE;
12687 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012688 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012689 if (substring == NULL) {
12690 if (PyErr_ExceptionMatches(PyExc_TypeError))
12691 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12692 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012693 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012694 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012695 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012697 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698}
12699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012700#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012701
12702PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012703 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012704\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012705Return a formatted version of S, using substitutions from args and kwargs.\n\
12706The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012707
Eric Smith27bbca62010-11-04 17:06:58 +000012708PyDoc_STRVAR(format_map__doc__,
12709 "S.format_map(mapping) -> str\n\
12710\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012711Return a formatted version of S, using substitutions from mapping.\n\
12712The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012713
Eric Smith4a7d76d2008-05-30 18:10:19 +000012714static PyObject *
12715unicode__format__(PyObject* self, PyObject* args)
12716{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012717 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012718
12719 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12720 return NULL;
12721
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012722 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012723 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012724 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012725}
12726
Eric Smith8c663262007-08-25 02:26:07 +000012727PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012728 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012729\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012730Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012731
12732static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012733unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012734{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 Py_ssize_t size;
12736
12737 /* If it's a compact object, account for base structure +
12738 character data. */
12739 if (PyUnicode_IS_COMPACT_ASCII(v))
12740 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12741 else if (PyUnicode_IS_COMPACT(v))
12742 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012743 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012744 else {
12745 /* If it is a two-block object, account for base object, and
12746 for character block if present. */
12747 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012748 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012749 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012750 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751 }
12752 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012753 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012754 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012755 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012756 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012757 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012758
12759 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012760}
12761
12762PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012763 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012764
12765static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012766unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012767{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012768 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012769 if (!copy)
12770 return NULL;
12771 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012772}
12773
Guido van Rossumd57fd912000-03-10 22:53:23 +000012774static PyMethodDef unicode_methods[] = {
12775
12776 /* Order is according to common usage: often used methods should
12777 appear first, since lookup is done sequentially. */
12778
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012779 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012780 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12781 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012782 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012783 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12784 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12785 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12786 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12787 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12788 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12789 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012790 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012791 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12792 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12793 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012794 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012795 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12796 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12797 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012798 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012799 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012800 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012801 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012802 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12803 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12804 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12805 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12806 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12807 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12808 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12809 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12810 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12811 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12812 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12813 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12814 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12815 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012816 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012817 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012818 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012819 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012820 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012821 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012822 {"maketrans", (PyCFunction) unicode_maketrans,
12823 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012824 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012825#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012826 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012827#endif
12828
12829#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012830 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012831 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012832#endif
12833
Benjamin Peterson14339b62009-01-31 16:36:08 +000012834 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012835 {NULL, NULL}
12836};
12837
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012838static PyObject *
12839unicode_mod(PyObject *v, PyObject *w)
12840{
Brian Curtindfc80e32011-08-10 20:28:54 -050012841 if (!PyUnicode_Check(v))
12842 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012843 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012844}
12845
12846static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012847 0, /*nb_add*/
12848 0, /*nb_subtract*/
12849 0, /*nb_multiply*/
12850 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012851};
12852
Guido van Rossumd57fd912000-03-10 22:53:23 +000012853static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012854 (lenfunc) unicode_length, /* sq_length */
12855 PyUnicode_Concat, /* sq_concat */
12856 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12857 (ssizeargfunc) unicode_getitem, /* sq_item */
12858 0, /* sq_slice */
12859 0, /* sq_ass_item */
12860 0, /* sq_ass_slice */
12861 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012862};
12863
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012864static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012865unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012866{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012867 if (PyUnicode_READY(self) == -1)
12868 return NULL;
12869
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012870 if (PyIndex_Check(item)) {
12871 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012872 if (i == -1 && PyErr_Occurred())
12873 return NULL;
12874 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012875 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012876 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012877 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012878 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012879 PyObject *result;
12880 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012881 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012882 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012884 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012885 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012886 return NULL;
12887 }
12888
12889 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010012890 Py_INCREF(unicode_empty);
12891 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012892 } else if (start == 0 && step == 1 &&
12893 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012894 PyUnicode_CheckExact(self)) {
12895 Py_INCREF(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012896 return self;
Thomas Woutersed03b412007-08-28 21:37:11 +000012897 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012898 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020012899 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012900 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012901 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012902 src_kind = PyUnicode_KIND(self);
12903 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020012904 if (!PyUnicode_IS_ASCII(self)) {
12905 kind_limit = kind_maxchar_limit(src_kind);
12906 max_char = 0;
12907 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12908 ch = PyUnicode_READ(src_kind, src_data, cur);
12909 if (ch > max_char) {
12910 max_char = ch;
12911 if (max_char >= kind_limit)
12912 break;
12913 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012914 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012915 }
Victor Stinner55c99112011-10-13 01:17:06 +020012916 else
12917 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012918 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012919 if (result == NULL)
12920 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012921 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012922 dest_data = PyUnicode_DATA(result);
12923
12924 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012925 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12926 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012927 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012928 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012929 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012930 } else {
12931 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12932 return NULL;
12933 }
12934}
12935
12936static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012937 (lenfunc)unicode_length, /* mp_length */
12938 (binaryfunc)unicode_subscript, /* mp_subscript */
12939 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012940};
12941
Guido van Rossumd57fd912000-03-10 22:53:23 +000012942
Guido van Rossumd57fd912000-03-10 22:53:23 +000012943/* Helpers for PyUnicode_Format() */
12944
12945static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012946getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012947{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012948 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012949 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012950 (*p_argidx)++;
12951 if (arglen < 0)
12952 return args;
12953 else
12954 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012955 }
12956 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012957 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958 return NULL;
12959}
12960
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012961/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012962
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012963static PyObject *
12964formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012965{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012966 char *p;
12967 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012968 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012969
Guido van Rossumd57fd912000-03-10 22:53:23 +000012970 x = PyFloat_AsDouble(v);
12971 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012972 return NULL;
12973
Guido van Rossumd57fd912000-03-10 22:53:23 +000012974 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012975 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012976
Eric Smith0923d1d2009-04-16 20:16:10 +000012977 p = PyOS_double_to_string(x, type, prec,
12978 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012979 if (p == NULL)
12980 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012981 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012982 PyMem_Free(p);
12983 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012984}
12985
Tim Peters38fd5b62000-09-21 05:43:11 +000012986static PyObject*
12987formatlong(PyObject *val, int flags, int prec, int type)
12988{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012989 char *buf;
12990 int len;
12991 PyObject *str; /* temporary string object. */
12992 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012993
Benjamin Peterson14339b62009-01-31 16:36:08 +000012994 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12995 if (!str)
12996 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012997 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012998 Py_DECREF(str);
12999 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013000}
13001
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013002static Py_UCS4
13003formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013004{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013005 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013006 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013007 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013008 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013009 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013010 goto onError;
13011 }
13012 else {
13013 /* Integer input truncated to a character */
13014 long x;
13015 x = PyLong_AsLong(v);
13016 if (x == -1 && PyErr_Occurred())
13017 goto onError;
13018
Victor Stinner8faf8212011-12-08 22:14:11 +010013019 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013020 PyErr_SetString(PyExc_OverflowError,
13021 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013022 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013023 }
13024
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013025 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013026 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013027
Benjamin Peterson29060642009-01-31 22:14:21 +000013028 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013029 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013030 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013031 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013032}
13033
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013034static int
13035repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13036{
13037 int r;
13038 assert(count > 0);
13039 assert(PyUnicode_Check(obj));
13040 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013041 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013042 if (repeated == NULL)
13043 return -1;
13044 r = _PyAccu_Accumulate(acc, repeated);
13045 Py_DECREF(repeated);
13046 return r;
13047 }
13048 else {
13049 do {
13050 if (_PyAccu_Accumulate(acc, obj))
13051 return -1;
13052 } while (--count);
13053 return 0;
13054 }
13055}
13056
Alexander Belopolsky40018472011-02-26 01:02:56 +000013057PyObject *
13058PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013059{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013060 void *fmt;
13061 int fmtkind;
13062 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013063 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013064 int r;
13065 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013066 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013067 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013068 PyObject *temp = NULL;
13069 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013070 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013071 _PyAccu acc;
13072 static PyObject *plus, *minus, *blank, *zero, *percent;
13073
13074 if (!plus && !(plus = get_latin1_char('+')))
13075 return NULL;
13076 if (!minus && !(minus = get_latin1_char('-')))
13077 return NULL;
13078 if (!blank && !(blank = get_latin1_char(' ')))
13079 return NULL;
13080 if (!zero && !(zero = get_latin1_char('0')))
13081 return NULL;
13082 if (!percent && !(percent = get_latin1_char('%')))
13083 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013084
Guido van Rossumd57fd912000-03-10 22:53:23 +000013085 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013086 PyErr_BadInternalCall();
13087 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013088 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013089 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013090 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013091 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013092 if (_PyAccu_Init(&acc))
13093 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013094 fmt = PyUnicode_DATA(uformat);
13095 fmtkind = PyUnicode_KIND(uformat);
13096 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13097 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098
Guido van Rossumd57fd912000-03-10 22:53:23 +000013099 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013100 arglen = PyTuple_Size(args);
13101 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102 }
13103 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013104 arglen = -1;
13105 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013106 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013107 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013108 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013109 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110
13111 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013112 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013113 PyObject *nonfmt;
13114 Py_ssize_t nonfmtpos;
13115 nonfmtpos = fmtpos++;
13116 while (fmtcnt >= 0 &&
13117 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13118 fmtpos++;
13119 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013120 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013121 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013122 if (nonfmt == NULL)
13123 goto onError;
13124 r = _PyAccu_Accumulate(&acc, nonfmt);
13125 Py_DECREF(nonfmt);
13126 if (r)
13127 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013128 }
13129 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013130 /* Got a format specifier */
13131 int flags = 0;
13132 Py_ssize_t width = -1;
13133 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013134 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013135 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013136 int isnumok;
13137 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013138 void *pbuf = NULL;
13139 Py_ssize_t pindex, len;
13140 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013142 fmtpos++;
13143 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13144 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013145 Py_ssize_t keylen;
13146 PyObject *key;
13147 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013148
Benjamin Peterson29060642009-01-31 22:14:21 +000013149 if (dict == NULL) {
13150 PyErr_SetString(PyExc_TypeError,
13151 "format requires a mapping");
13152 goto onError;
13153 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013154 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013155 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013156 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013157 /* Skip over balanced parentheses */
13158 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013159 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013160 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013161 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013162 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013163 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013164 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013165 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013166 if (fmtcnt < 0 || pcount > 0) {
13167 PyErr_SetString(PyExc_ValueError,
13168 "incomplete format key");
13169 goto onError;
13170 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013171 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013172 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013173 if (key == NULL)
13174 goto onError;
13175 if (args_owned) {
13176 Py_DECREF(args);
13177 args_owned = 0;
13178 }
13179 args = PyObject_GetItem(dict, key);
13180 Py_DECREF(key);
13181 if (args == NULL) {
13182 goto onError;
13183 }
13184 args_owned = 1;
13185 arglen = -1;
13186 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013187 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013188 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013189 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013190 case '-': flags |= F_LJUST; continue;
13191 case '+': flags |= F_SIGN; continue;
13192 case ' ': flags |= F_BLANK; continue;
13193 case '#': flags |= F_ALT; continue;
13194 case '0': flags |= F_ZERO; continue;
13195 }
13196 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013197 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013198 if (c == '*') {
13199 v = getnextarg(args, arglen, &argidx);
13200 if (v == NULL)
13201 goto onError;
13202 if (!PyLong_Check(v)) {
13203 PyErr_SetString(PyExc_TypeError,
13204 "* wants int");
13205 goto onError;
13206 }
13207 width = PyLong_AsLong(v);
13208 if (width == -1 && PyErr_Occurred())
13209 goto onError;
13210 if (width < 0) {
13211 flags |= F_LJUST;
13212 width = -width;
13213 }
13214 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013215 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013216 }
13217 else if (c >= '0' && c <= '9') {
13218 width = c - '0';
13219 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013220 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013221 if (c < '0' || c > '9')
13222 break;
13223 if ((width*10) / 10 != width) {
13224 PyErr_SetString(PyExc_ValueError,
13225 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013226 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013227 }
13228 width = width*10 + (c - '0');
13229 }
13230 }
13231 if (c == '.') {
13232 prec = 0;
13233 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013234 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013235 if (c == '*') {
13236 v = getnextarg(args, arglen, &argidx);
13237 if (v == NULL)
13238 goto onError;
13239 if (!PyLong_Check(v)) {
13240 PyErr_SetString(PyExc_TypeError,
13241 "* wants int");
13242 goto onError;
13243 }
13244 prec = PyLong_AsLong(v);
13245 if (prec == -1 && PyErr_Occurred())
13246 goto onError;
13247 if (prec < 0)
13248 prec = 0;
13249 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013250 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013251 }
13252 else if (c >= '0' && c <= '9') {
13253 prec = c - '0';
13254 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013255 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013256 if (c < '0' || c > '9')
13257 break;
13258 if ((prec*10) / 10 != prec) {
13259 PyErr_SetString(PyExc_ValueError,
13260 "prec too big");
13261 goto onError;
13262 }
13263 prec = prec*10 + (c - '0');
13264 }
13265 }
13266 } /* prec */
13267 if (fmtcnt >= 0) {
13268 if (c == 'h' || c == 'l' || c == 'L') {
13269 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013270 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013271 }
13272 }
13273 if (fmtcnt < 0) {
13274 PyErr_SetString(PyExc_ValueError,
13275 "incomplete format");
13276 goto onError;
13277 }
13278 if (c != '%') {
13279 v = getnextarg(args, arglen, &argidx);
13280 if (v == NULL)
13281 goto onError;
13282 }
13283 sign = 0;
13284 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013285 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013286 switch (c) {
13287
13288 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013289 _PyAccu_Accumulate(&acc, percent);
13290 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013291
13292 case 's':
13293 case 'r':
13294 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013295 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013296 temp = v;
13297 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013298 }
13299 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013300 if (c == 's')
13301 temp = PyObject_Str(v);
13302 else if (c == 'r')
13303 temp = PyObject_Repr(v);
13304 else
13305 temp = PyObject_ASCII(v);
13306 if (temp == NULL)
13307 goto onError;
13308 if (PyUnicode_Check(temp))
13309 /* nothing to do */;
13310 else {
13311 Py_DECREF(temp);
13312 PyErr_SetString(PyExc_TypeError,
13313 "%s argument has non-string str()");
13314 goto onError;
13315 }
13316 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013317 if (PyUnicode_READY(temp) == -1) {
13318 Py_CLEAR(temp);
13319 goto onError;
13320 }
13321 pbuf = PyUnicode_DATA(temp);
13322 kind = PyUnicode_KIND(temp);
13323 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013324 if (prec >= 0 && len > prec)
13325 len = prec;
13326 break;
13327
13328 case 'i':
13329 case 'd':
13330 case 'u':
13331 case 'o':
13332 case 'x':
13333 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013334 isnumok = 0;
13335 if (PyNumber_Check(v)) {
13336 PyObject *iobj=NULL;
13337
13338 if (PyLong_Check(v)) {
13339 iobj = v;
13340 Py_INCREF(iobj);
13341 }
13342 else {
13343 iobj = PyNumber_Long(v);
13344 }
13345 if (iobj!=NULL) {
13346 if (PyLong_Check(iobj)) {
13347 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013348 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013349 Py_DECREF(iobj);
13350 if (!temp)
13351 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013352 if (PyUnicode_READY(temp) == -1) {
13353 Py_CLEAR(temp);
13354 goto onError;
13355 }
13356 pbuf = PyUnicode_DATA(temp);
13357 kind = PyUnicode_KIND(temp);
13358 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013359 sign = 1;
13360 }
13361 else {
13362 Py_DECREF(iobj);
13363 }
13364 }
13365 }
13366 if (!isnumok) {
13367 PyErr_Format(PyExc_TypeError,
13368 "%%%c format: a number is required, "
13369 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13370 goto onError;
13371 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013372 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013373 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013374 fillobj = zero;
13375 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013376 break;
13377
13378 case 'e':
13379 case 'E':
13380 case 'f':
13381 case 'F':
13382 case 'g':
13383 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013384 temp = formatfloat(v, flags, prec, c);
13385 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013386 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013387 if (PyUnicode_READY(temp) == -1) {
13388 Py_CLEAR(temp);
13389 goto onError;
13390 }
13391 pbuf = PyUnicode_DATA(temp);
13392 kind = PyUnicode_KIND(temp);
13393 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013394 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013395 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013396 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013397 fillobj = zero;
13398 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013399 break;
13400
13401 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013402 {
13403 Py_UCS4 ch = formatchar(v);
13404 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013405 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013406 temp = _PyUnicode_FromUCS4(&ch, 1);
13407 if (temp == NULL)
13408 goto onError;
13409 pbuf = PyUnicode_DATA(temp);
13410 kind = PyUnicode_KIND(temp);
13411 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013412 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013413 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013414
13415 default:
13416 PyErr_Format(PyExc_ValueError,
13417 "unsupported format character '%c' (0x%x) "
13418 "at index %zd",
13419 (31<=c && c<=126) ? (char)c : '?',
13420 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013421 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013422 goto onError;
13423 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013424 /* pbuf is initialized here. */
13425 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013426 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013427 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13428 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013429 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013430 pindex++;
13431 }
13432 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13433 signobj = plus;
13434 len--;
13435 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013436 }
13437 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013438 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013439 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013440 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013441 else
13442 sign = 0;
13443 }
13444 if (width < len)
13445 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013446 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013447 if (fill != ' ') {
13448 assert(signobj != NULL);
13449 if (_PyAccu_Accumulate(&acc, signobj))
13450 goto onError;
13451 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013452 if (width > len)
13453 width--;
13454 }
13455 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013456 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013457 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013458 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013459 second = get_latin1_char(
13460 PyUnicode_READ(kind, pbuf, pindex + 1));
13461 pindex += 2;
13462 if (second == NULL ||
13463 _PyAccu_Accumulate(&acc, zero) ||
13464 _PyAccu_Accumulate(&acc, second))
13465 goto onError;
13466 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013467 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013468 width -= 2;
13469 if (width < 0)
13470 width = 0;
13471 len -= 2;
13472 }
13473 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013474 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013475 if (repeat_accumulate(&acc, fillobj, width - len))
13476 goto onError;
13477 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013478 }
13479 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013480 if (sign) {
13481 assert(signobj != NULL);
13482 if (_PyAccu_Accumulate(&acc, signobj))
13483 goto onError;
13484 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013485 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013486 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13487 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013488 second = get_latin1_char(
13489 PyUnicode_READ(kind, pbuf, pindex + 1));
13490 pindex += 2;
13491 if (second == NULL ||
13492 _PyAccu_Accumulate(&acc, zero) ||
13493 _PyAccu_Accumulate(&acc, second))
13494 goto onError;
13495 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013496 }
13497 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013498 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013499 if (temp != NULL) {
13500 assert(pbuf == PyUnicode_DATA(temp));
13501 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013502 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013503 else {
13504 const char *p = (const char *) pbuf;
13505 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013506 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013507 v = PyUnicode_FromKindAndData(kind, p, len);
13508 }
13509 if (v == NULL)
13510 goto onError;
13511 r = _PyAccu_Accumulate(&acc, v);
13512 Py_DECREF(v);
13513 if (r)
13514 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013515 if (width > len && repeat_accumulate(&acc, blank, width - len))
13516 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013517 if (dict && (argidx < arglen) && c != '%') {
13518 PyErr_SetString(PyExc_TypeError,
13519 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013520 goto onError;
13521 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013522 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013523 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013524 } /* until end */
13525 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013526 PyErr_SetString(PyExc_TypeError,
13527 "not all arguments converted during string formatting");
13528 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013529 }
13530
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013531 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013532 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013533 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013534 }
13535 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013536 Py_XDECREF(temp);
13537 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013538 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013539
Benjamin Peterson29060642009-01-31 22:14:21 +000013540 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013541 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013542 Py_XDECREF(temp);
13543 Py_XDECREF(second);
13544 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013545 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013546 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013547 }
13548 return NULL;
13549}
13550
Jeremy Hylton938ace62002-07-17 16:30:39 +000013551static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013552unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13553
Tim Peters6d6c1a32001-08-02 04:15:00 +000013554static PyObject *
13555unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13556{
Benjamin Peterson29060642009-01-31 22:14:21 +000013557 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013558 static char *kwlist[] = {"object", "encoding", "errors", 0};
13559 char *encoding = NULL;
13560 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013561
Benjamin Peterson14339b62009-01-31 16:36:08 +000013562 if (type != &PyUnicode_Type)
13563 return unicode_subtype_new(type, args, kwds);
13564 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013565 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013566 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013567 if (x == NULL) {
13568 Py_INCREF(unicode_empty);
13569 return unicode_empty;
13570 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013571 if (encoding == NULL && errors == NULL)
13572 return PyObject_Str(x);
13573 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013574 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013575}
13576
Guido van Rossume023fe02001-08-30 03:12:59 +000013577static PyObject *
13578unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13579{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013580 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013581 Py_ssize_t length, char_size;
13582 int share_wstr, share_utf8;
13583 unsigned int kind;
13584 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013585
Benjamin Peterson14339b62009-01-31 16:36:08 +000013586 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013587
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013588 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013589 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013590 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013591 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013592 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013593 return NULL;
13594
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013595 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013596 if (self == NULL) {
13597 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013598 return NULL;
13599 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013600 kind = PyUnicode_KIND(unicode);
13601 length = PyUnicode_GET_LENGTH(unicode);
13602
13603 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013604#ifdef Py_DEBUG
13605 _PyUnicode_HASH(self) = -1;
13606#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013607 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013608#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013609 _PyUnicode_STATE(self).interned = 0;
13610 _PyUnicode_STATE(self).kind = kind;
13611 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013612 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013613 _PyUnicode_STATE(self).ready = 1;
13614 _PyUnicode_WSTR(self) = NULL;
13615 _PyUnicode_UTF8_LENGTH(self) = 0;
13616 _PyUnicode_UTF8(self) = NULL;
13617 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013618 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013619
13620 share_utf8 = 0;
13621 share_wstr = 0;
13622 if (kind == PyUnicode_1BYTE_KIND) {
13623 char_size = 1;
13624 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13625 share_utf8 = 1;
13626 }
13627 else if (kind == PyUnicode_2BYTE_KIND) {
13628 char_size = 2;
13629 if (sizeof(wchar_t) == 2)
13630 share_wstr = 1;
13631 }
13632 else {
13633 assert(kind == PyUnicode_4BYTE_KIND);
13634 char_size = 4;
13635 if (sizeof(wchar_t) == 4)
13636 share_wstr = 1;
13637 }
13638
13639 /* Ensure we won't overflow the length. */
13640 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13641 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013642 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013643 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013644 data = PyObject_MALLOC((length + 1) * char_size);
13645 if (data == NULL) {
13646 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013647 goto onError;
13648 }
13649
Victor Stinnerc3c74152011-10-02 20:39:55 +020013650 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013651 if (share_utf8) {
13652 _PyUnicode_UTF8_LENGTH(self) = length;
13653 _PyUnicode_UTF8(self) = data;
13654 }
13655 if (share_wstr) {
13656 _PyUnicode_WSTR_LENGTH(self) = length;
13657 _PyUnicode_WSTR(self) = (wchar_t *)data;
13658 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013659
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013660 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013661 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013662 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013663#ifdef Py_DEBUG
13664 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13665#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013666 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013667 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013668
13669onError:
13670 Py_DECREF(unicode);
13671 Py_DECREF(self);
13672 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013673}
13674
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013675PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013676 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013677\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013678Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013679encoding defaults to the current default string encoding.\n\
13680errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013681
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013682static PyObject *unicode_iter(PyObject *seq);
13683
Guido van Rossumd57fd912000-03-10 22:53:23 +000013684PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013685 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013686 "str", /* tp_name */
13687 sizeof(PyUnicodeObject), /* tp_size */
13688 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013689 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013690 (destructor)unicode_dealloc, /* tp_dealloc */
13691 0, /* tp_print */
13692 0, /* tp_getattr */
13693 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013694 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013695 unicode_repr, /* tp_repr */
13696 &unicode_as_number, /* tp_as_number */
13697 &unicode_as_sequence, /* tp_as_sequence */
13698 &unicode_as_mapping, /* tp_as_mapping */
13699 (hashfunc) unicode_hash, /* tp_hash*/
13700 0, /* tp_call*/
13701 (reprfunc) unicode_str, /* tp_str */
13702 PyObject_GenericGetAttr, /* tp_getattro */
13703 0, /* tp_setattro */
13704 0, /* tp_as_buffer */
13705 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013706 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013707 unicode_doc, /* tp_doc */
13708 0, /* tp_traverse */
13709 0, /* tp_clear */
13710 PyUnicode_RichCompare, /* tp_richcompare */
13711 0, /* tp_weaklistoffset */
13712 unicode_iter, /* tp_iter */
13713 0, /* tp_iternext */
13714 unicode_methods, /* tp_methods */
13715 0, /* tp_members */
13716 0, /* tp_getset */
13717 &PyBaseObject_Type, /* tp_base */
13718 0, /* tp_dict */
13719 0, /* tp_descr_get */
13720 0, /* tp_descr_set */
13721 0, /* tp_dictoffset */
13722 0, /* tp_init */
13723 0, /* tp_alloc */
13724 unicode_new, /* tp_new */
13725 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013726};
13727
13728/* Initialize the Unicode implementation */
13729
Victor Stinner3a50e702011-10-18 21:21:00 +020013730int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013731{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013732 int i;
13733
Thomas Wouters477c8d52006-05-27 19:21:47 +000013734 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013735 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013736 0x000A, /* LINE FEED */
13737 0x000D, /* CARRIAGE RETURN */
13738 0x001C, /* FILE SEPARATOR */
13739 0x001D, /* GROUP SEPARATOR */
13740 0x001E, /* RECORD SEPARATOR */
13741 0x0085, /* NEXT LINE */
13742 0x2028, /* LINE SEPARATOR */
13743 0x2029, /* PARAGRAPH SEPARATOR */
13744 };
13745
Fred Drakee4315f52000-05-09 19:53:39 +000013746 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013747 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013748 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013749 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010013750 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013751
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013752 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013753 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013754 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013755 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013756
13757 /* initialize the linebreak bloom filter */
13758 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013759 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013760 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013761
13762 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013763
13764#ifdef HAVE_MBCS
13765 winver.dwOSVersionInfoSize = sizeof(winver);
13766 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13767 PyErr_SetFromWindowsErr(0);
13768 return -1;
13769 }
13770#endif
13771 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013772}
13773
13774/* Finalize the Unicode implementation */
13775
/* Clear the Unicode free list.

   This implementation keeps no free list, so there is never anything to
   release and the number of freed items reported is always 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed_items = 0;

    return freed_items;
}
13781
Guido van Rossumd57fd912000-03-10 22:53:23 +000013782void
Thomas Wouters78890102000-07-22 19:25:51 +000013783_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013784{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013785 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013786
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013787 Py_XDECREF(unicode_empty);
13788 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013789
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013790 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013791 if (unicode_latin1[i]) {
13792 Py_DECREF(unicode_latin1[i]);
13793 unicode_latin1[i] = NULL;
13794 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013795 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013796 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013797 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013798}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013799
Walter Dörwald16807132007-05-25 13:52:07 +000013800void
13801PyUnicode_InternInPlace(PyObject **p)
13802{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013803 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013804 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013805#ifdef Py_DEBUG
13806 assert(s != NULL);
13807 assert(_PyUnicode_CHECK(s));
13808#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013809 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013810 return;
13811#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013812 /* If it's a subclass, we don't really know what putting
13813 it in the interned dict might do. */
13814 if (!PyUnicode_CheckExact(s))
13815 return;
13816 if (PyUnicode_CHECK_INTERNED(s))
13817 return;
13818 if (interned == NULL) {
13819 interned = PyDict_New();
13820 if (interned == NULL) {
13821 PyErr_Clear(); /* Don't leave an exception */
13822 return;
13823 }
13824 }
13825 /* It might be that the GetItem call fails even
13826 though the key is present in the dictionary,
13827 namely when this happens during a stack overflow. */
13828 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013829 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013830 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013831
Benjamin Peterson29060642009-01-31 22:14:21 +000013832 if (t) {
13833 Py_INCREF(t);
13834 Py_DECREF(*p);
13835 *p = t;
13836 return;
13837 }
Walter Dörwald16807132007-05-25 13:52:07 +000013838
Benjamin Peterson14339b62009-01-31 16:36:08 +000013839 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013840 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013841 PyErr_Clear();
13842 PyThreadState_GET()->recursion_critical = 0;
13843 return;
13844 }
13845 PyThreadState_GET()->recursion_critical = 0;
13846 /* The two references in interned are not counted by refcnt.
13847 The deallocator will take care of this */
13848 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013849 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013850}
13851
13852void
13853PyUnicode_InternImmortal(PyObject **p)
13854{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013855 PyUnicode_InternInPlace(p);
13856 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020013857 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013858 Py_INCREF(*p);
13859 }
Walter Dörwald16807132007-05-25 13:52:07 +000013860}
13861
13862PyObject *
13863PyUnicode_InternFromString(const char *cp)
13864{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013865 PyObject *s = PyUnicode_FromString(cp);
13866 if (s == NULL)
13867 return NULL;
13868 PyUnicode_InternInPlace(&s);
13869 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013870}
13871
Alexander Belopolsky40018472011-02-26 01:02:56 +000013872void
13873_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013874{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013875 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013876 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013877 Py_ssize_t i, n;
13878 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013879
Benjamin Peterson14339b62009-01-31 16:36:08 +000013880 if (interned == NULL || !PyDict_Check(interned))
13881 return;
13882 keys = PyDict_Keys(interned);
13883 if (keys == NULL || !PyList_Check(keys)) {
13884 PyErr_Clear();
13885 return;
13886 }
Walter Dörwald16807132007-05-25 13:52:07 +000013887
Benjamin Peterson14339b62009-01-31 16:36:08 +000013888 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13889 detector, interned unicode strings are not forcibly deallocated;
13890 rather, we give them their stolen references back, and then clear
13891 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013892
Benjamin Peterson14339b62009-01-31 16:36:08 +000013893 n = PyList_GET_SIZE(keys);
13894 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013895 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013896 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013897 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013898 if (PyUnicode_READY(s) == -1) {
13899 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013900 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013901 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013902 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013903 case SSTATE_NOT_INTERNED:
13904 /* XXX Shouldn't happen */
13905 break;
13906 case SSTATE_INTERNED_IMMORTAL:
13907 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013908 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013909 break;
13910 case SSTATE_INTERNED_MORTAL:
13911 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013912 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013913 break;
13914 default:
13915 Py_FatalError("Inconsistent interned string state.");
13916 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013917 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013918 }
13919 fprintf(stderr, "total size of all interned strings: "
13920 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13921 "mortal/immortal\n", mortal_size, immortal_size);
13922 Py_DECREF(keys);
13923 PyDict_Clear(interned);
13924 Py_DECREF(interned);
13925 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013926}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013927
13928
13929/********************* Unicode Iterator **************************/
13930
13931typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013932 PyObject_HEAD
13933 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013934 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013935} unicodeiterobject;
13936
13937static void
13938unicodeiter_dealloc(unicodeiterobject *it)
13939{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013940 _PyObject_GC_UNTRACK(it);
13941 Py_XDECREF(it->it_seq);
13942 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013943}
13944
13945static int
13946unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13947{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013948 Py_VISIT(it->it_seq);
13949 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013950}
13951
13952static PyObject *
13953unicodeiter_next(unicodeiterobject *it)
13954{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013955 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013956
Benjamin Peterson14339b62009-01-31 16:36:08 +000013957 assert(it != NULL);
13958 seq = it->it_seq;
13959 if (seq == NULL)
13960 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013961 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013963 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13964 int kind = PyUnicode_KIND(seq);
13965 void *data = PyUnicode_DATA(seq);
13966 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13967 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013968 if (item != NULL)
13969 ++it->it_index;
13970 return item;
13971 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013972
Benjamin Peterson14339b62009-01-31 16:36:08 +000013973 Py_DECREF(seq);
13974 it->it_seq = NULL;
13975 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013976}
13977
13978static PyObject *
13979unicodeiter_len(unicodeiterobject *it)
13980{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013981 Py_ssize_t len = 0;
13982 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013983 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013984 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013985}
13986
13987PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13988
13989static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013990 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013991 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013992 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013993};
13994
13995PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013996 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13997 "str_iterator", /* tp_name */
13998 sizeof(unicodeiterobject), /* tp_basicsize */
13999 0, /* tp_itemsize */
14000 /* methods */
14001 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14002 0, /* tp_print */
14003 0, /* tp_getattr */
14004 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014005 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014006 0, /* tp_repr */
14007 0, /* tp_as_number */
14008 0, /* tp_as_sequence */
14009 0, /* tp_as_mapping */
14010 0, /* tp_hash */
14011 0, /* tp_call */
14012 0, /* tp_str */
14013 PyObject_GenericGetAttr, /* tp_getattro */
14014 0, /* tp_setattro */
14015 0, /* tp_as_buffer */
14016 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14017 0, /* tp_doc */
14018 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14019 0, /* tp_clear */
14020 0, /* tp_richcompare */
14021 0, /* tp_weaklistoffset */
14022 PyObject_SelfIter, /* tp_iter */
14023 (iternextfunc)unicodeiter_next, /* tp_iternext */
14024 unicodeiter_methods, /* tp_methods */
14025 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014026};
14027
14028static PyObject *
14029unicode_iter(PyObject *seq)
14030{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014031 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014032
Benjamin Peterson14339b62009-01-31 16:36:08 +000014033 if (!PyUnicode_Check(seq)) {
14034 PyErr_BadInternalCall();
14035 return NULL;
14036 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014037 if (PyUnicode_READY(seq) == -1)
14038 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014039 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14040 if (it == NULL)
14041 return NULL;
14042 it->it_index = 0;
14043 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014044 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014045 _PyObject_GC_TRACK(it);
14046 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014047}
14048
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014049
/* Return the number of Py_UNICODE elements in u that precede the
   terminating null element.

   The length is derived from a pointer difference rather than counted in
   an int, so it cannot overflow for strings longer than INT_MAX. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    const Py_UNICODE *end = u;

    while (*end != 0)
        end++;
    return (size_t)(end - u);
}
14058
/* Copy the null-terminated string s2, including its terminator, to s1.
   Returns s1.  The caller must provide a large enough destination. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;
    Py_UNICODE ch;

    do {
        ch = *s2++;
        *dst++ = ch;
    } while (ch != 0);
    return s1;
}
14066
/* Copy at most n elements of the null-terminated string s2 to s1 and
   return s1.

   Copying stops right after the terminating null element has been copied.
   If s2 holds n or more characters, exactly n elements are written and the
   result is NOT null-terminated.  Unlike the C library strncpy(), the rest
   of the destination is not padded with null elements.

   The bound is checked before every write, so no more than n elements are
   ever stored and nothing is written when n is 0. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *dst = s1;

    for (; n > 0; n--) {
        if ((*dst++ = *s2++) == 0)
            break;
    }
    return s1;
}
14076
/* Append the null-terminated string s2 (terminator included) to the end of
   the null-terminated string s1.  Returns s1.  The caller must provide a
   large enough destination. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *tail = s1;

    /* Locate the terminator of s1 ... */
    while (*tail != 0)
        tail++;
    /* ... and copy s2 over it, up to and including s2's terminator. */
    while ((*tail++ = *s2++) != 0)
        ;
    return s1;
}
14085
/* Compare two null-terminated strings element by element.

   Returns 0 if they are equal, -1 if s1 sorts before s2 and 1 if it sorts
   after.  A string that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix. */
    while (*s1 != 0 && *s2 != 0 && *s1 == *s2) {
        s1++;
        s2++;
    }

    if (*s1 == 0)
        return (*s2 == 0) ? 0 : -1;
    if (*s2 == 0)
        return 1;
    /* Both strings continue, so the current elements differ. */
    return (*s1 < *s2) ? -1 : 1;
}
14099
/* Compare at most the first n elements of two null-terminated strings.

   Returns 0 if they are equal over that range (or if n is 0), -1 if s1
   sorts before s2 and 1 if it sorts after.  The comparison stops at the
   first terminating null element. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t pos;

    for (pos = 0; pos < n; pos++) {
        Py_UNICODE left = s1[pos];
        Py_UNICODE right = s2[pos];

        if (left != right)
            return (left < right) ? -1 : +1;
        /* Equal elements: if they are the terminator, both strings end. */
        if (left == '\0')
            return 0;
    }
    return 0;
}
14116
/* Return a pointer to the first occurrence of c in the null-terminated
   string s, or NULL if c does not occur.

   The terminating null element is never matched, so searching for 0
   returns NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *cursor = s;

    while (*cursor != 0) {
        if (*cursor == c)
            return (Py_UNICODE*)cursor;
        cursor++;
    }
    return NULL;
}
14126
/* Return a pointer to the last occurrence of c in the null-terminated
   string s, or NULL if c does not occur.

   The terminating null element is never matched, so searching for 0
   returns NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *last_match = NULL;
    const Py_UNICODE *cursor;

    /* A single forward pass that remembers the most recent match finds
       the same element as scanning backwards from the end. */
    for (cursor = s; *cursor != 0; cursor++) {
        if (*cursor == c)
            last_match = cursor;
    }
    return (Py_UNICODE*)last_match;
}
Victor Stinner331ea922010-08-10 16:37:20 +000014139
Victor Stinner71133ff2010-09-01 23:43:53 +000014140Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014141PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014142{
Victor Stinner577db2c2011-10-11 22:12:48 +020014143 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014144 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014146 if (!PyUnicode_Check(unicode)) {
14147 PyErr_BadArgument();
14148 return NULL;
14149 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014150 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014151 if (u == NULL)
14152 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014153 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014154 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014155 PyErr_NoMemory();
14156 return NULL;
14157 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014158 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014159 size *= sizeof(Py_UNICODE);
14160 copy = PyMem_Malloc(size);
14161 if (copy == NULL) {
14162 PyErr_NoMemory();
14163 return NULL;
14164 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014165 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014166 return copy;
14167}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014168
Georg Brandl66c221e2010-10-14 07:04:07 +000014169/* A _string module, to export formatter_parser and formatter_field_name_split
14170 to the string.Formatter class implemented in Python. */
14171
14172static PyMethodDef _string_methods[] = {
14173 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14174 METH_O, PyDoc_STR("split the argument as a field name")},
14175 {"formatter_parser", (PyCFunction) formatter_parser,
14176 METH_O, PyDoc_STR("parse the argument as a format string")},
14177 {NULL, NULL}
14178};
14179
14180static struct PyModuleDef _string_module = {
14181 PyModuleDef_HEAD_INIT,
14182 "_string",
14183 PyDoc_STR("string helper module"),
14184 0,
14185 _string_methods,
14186 NULL,
14187 NULL,
14188 NULL,
14189 NULL
14190};
14191
14192PyMODINIT_FUNC
14193PyInit__string(void)
14194{
14195 return PyModule_Create(&_string_module);
14196}
14197
14198
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014199#ifdef __cplusplus
14200}
14201#endif