blob: cc4da2a18df96a07e53bb1c9487d4b07cbb3e52a [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is
   defined, selected by WORDS_BIGENDIAN. */

#if defined(WORDS_BIGENDIAN)
#  define BYTEORDER_IS_BIG_ENDIAN
#else
#  define BYTEORDER_IS_LITTLE_ENDIAN
#endif
56
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000057/* --- Globals ------------------------------------------------------------
58
59 The globals are initialized by the _PyUnicode_Init() API and should
60 not be used before calling that API.
61
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used inside the assertions of the macros below: debug builds
   run _PyUnicode_CheckConsistency(), other builds only do a type check. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Unchecked field accessors.  Each one expands to the struct member itself,
   so this file also uses them as assignment targets. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* Checked accessors: assert on the object first, then read the field. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 pointer of a ready string.  For a compact ASCII object this is the
   address right after the PyASCIIObject header; otherwise it is the utf8
   field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* UTF-8 length of a ready string: the character length for a compact ASCII
   object, the utf8_length field otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Local replacement for PyUnicode_READY: evaluates to 0 when the string is
   already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* true if the utf8 pointer of the object is the same address as its
   character data (only valid for objects that are not compact ASCII) */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* true if the wstr pointer of the object is the same address as its
   character data.  The macro uses its own argument throughout; it must not
   depend on the caller having a variable called "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data) */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
143
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Every argument is evaluated exactly once and is parenthesised before
   use, so "to" may be an arbitrary pointer expression (e.g. "p + k"): the
   expression is computed first and only the result is cast to "to_type *".
   The main loop copies four elements per iteration; the trailing loop
   handles the remaining 0..3 elements. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = _end - _iter;                   \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~(Py_ssize_t)3);              \
        while (_iter < _unrolled_end) {                 \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < _end)                            \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200167
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200168/* The Unicode string has been modified: reset the hash */
169#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
170
Walter Dörwald16807132007-05-25 13:52:07 +0000171/* This dictionary holds all interned unicode strings. Note that references
172 to strings in this dictionary are *not* counted in the string's ob_refcnt.
173 When the interned string reaches a refcnt of 0 the string deallocation
174 function will delete the reference from this dictionary.
175
176 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000177 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000178*/
179static PyObject *interned;
180
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200182static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000183
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200184/* List of static strings. */
185static _Py_Identifier *static_strings;
186
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187/* Single character Unicode strings in the Latin-1 range are being
188 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200189static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by code point: 1 marks a whitespace character,
   0 anything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F
       0x0009 CHARACTER TABULATION, 0x000A LINE FEED,
       0x000B LINE TABULATION, 0x000C FORM FEED, 0x000D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F
       0x001C FILE SEPARATOR, 0x001D GROUP SEPARATOR,
       0x001E RECORD SEPARATOR, 0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F
       0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
221
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200223static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200224static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200225static void copy_characters(
226 PyObject *to, Py_ssize_t to_start,
227 PyObject *from, Py_ssize_t from_start,
228 Py_ssize_t how_many);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200231unicode_fromascii(const unsigned char *s, Py_ssize_t size);
232static PyObject *
233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
234static PyObject *
235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
236static PyObject *
237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
238
239static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100242 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100248 PyObject *unicode,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Same for linebreaks: 128-entry table indexed by code point, 1 marks a
   line break character.  The table is only ever read (see BLOOM_LINEBREAK),
   so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#if defined(Py_DEBUG)
/* Debug-build invariant checker for a unicode object.

   op:            object to verify; must satisfy PyUnicode_Check().
   check_content: when non-zero (and the string is not in the wchar_t-only
                  state) also scan the characters and verify that the
                  narrowest possible kind is used for them.

   Every violated invariant trips an assert(); the function itself always
   returns 1 so that it can be wrapped in assert() by its callers. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII object */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr representation exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* non-compact object that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr is shared with the data exactly when the character
               width matches wchar_t */
            if (kind == wchar_sized_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i = 0;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);

        while (i < ascii->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
            i++;
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
    }
    return 1;
}
#endif
405
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100406static PyObject*
407unicode_result_wchar(PyObject *unicode)
408{
409#ifndef Py_DEBUG
410 Py_ssize_t len;
411
412 assert(Py_REFCNT(unicode) == 1);
413
414 len = _PyUnicode_WSTR_LENGTH(unicode);
415 if (len == 0) {
416 Py_INCREF(unicode_empty);
417 Py_DECREF(unicode);
418 return unicode_empty;
419 }
420
421 if (len == 1) {
422 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
423 if (ch < 256) {
424 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
425 Py_DECREF(unicode);
426 return latin1_char;
427 }
428 }
429
430 if (_PyUnicode_Ready(unicode) < 0) {
431 Py_XDECREF(unicode);
432 return NULL;
433 }
434#else
435 /* don't make the result ready in debug mode to ensure that the caller
436 makes the string ready before using it */
437 assert(_PyUnicode_CheckConsistency(unicode, 1));
438#endif
439 return unicode;
440}
441
442static PyObject*
443unicode_result_ready(PyObject *unicode)
444{
445 Py_ssize_t length;
446
447 length = PyUnicode_GET_LENGTH(unicode);
448 if (length == 0) {
449 if (unicode != unicode_empty) {
450 Py_INCREF(unicode_empty);
451 Py_DECREF(unicode);
452 }
453 return unicode_empty;
454 }
455
456 if (length == 1) {
457 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
458 if (ch < 256) {
459 PyObject *latin1_char = unicode_latin1[ch];
460 if (latin1_char != NULL) {
461 if (unicode != latin1_char) {
462 Py_INCREF(latin1_char);
463 Py_DECREF(unicode);
464 }
465 return latin1_char;
466 }
467 else {
468 assert(_PyUnicode_CheckConsistency(unicode, 1));
469 Py_INCREF(unicode);
470 unicode_latin1[ch] = unicode;
471 return unicode;
472 }
473 }
474 }
475
476 assert(_PyUnicode_CheckConsistency(unicode, 1));
477 return unicode;
478}
479
480static PyObject*
481unicode_result(PyObject *unicode)
482{
483 assert(_PyUnicode_CHECK(unicode));
484 if (PyUnicode_IS_READY(unicode))
485 return unicode_result_ready(unicode);
486 else
487 return unicode_result_wchar(unicode);
488}
489
#if defined(HAVE_MBCS)
/* Only declared when HAVE_MBCS is defined. */
static OSVERSIONINFOEX winver;
#endif
493
Thomas Wouters477c8d52006-05-27 19:21:47 +0000494/* --- Bloom Filters ----------------------------------------------------- */
495
496/* stuff to implement simple "bloom filters" for Unicode characters.
497 to keep things simple, we use a single bitmask, using the least 5
498 bits from each unicode characters as the bit index. */
499
500/* the linebreak mask is set up by Unicode_Init below */
501
Antoine Pitrouf068f942010-01-13 14:19:12 +0000502#if LONG_BIT >= 128
503#define BLOOM_WIDTH 128
504#elif LONG_BIT >= 64
505#define BLOOM_WIDTH 64
506#elif LONG_BIT >= 32
507#define BLOOM_WIDTH 32
508#else
509#error "LONG_BIT is smaller than 32"
510#endif
511
/* Type of a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Mask used by BLOOM_LINEBREAK for the non-ASCII line break characters. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit selected by ch in the lvalue mask.
   BLOOM(mask, ch):     non-zero if the bit selected by ch is set in mask.
   The bit index is ch & (BLOOM_WIDTH - 1).  Both arguments are
   parenthesised so that compound expressions such as "a | b" can be
   passed as mask. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: table lookup for ch < 128, otherwise the bloom mask is
   used as a cheap pre-filter in front of Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000522
Alexander Belopolsky40018472011-02-26 01:02:56 +0000523Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200524make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000525{
526 /* calculate simple bloom-style bitmask for a given unicode string */
527
Antoine Pitrouf068f942010-01-13 14:19:12 +0000528 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000529 Py_ssize_t i;
530
531 mask = 0;
532 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200533 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000534
535 return mask;
536}
537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200538#define BLOOM_MEMBER(mask, chr, str) \
539 (BLOOM(mask, chr) \
540 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000541
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200542/* Compilation of templated routines */
543
544#include "stringlib/asciilib.h"
545#include "stringlib/fastsearch.h"
546#include "stringlib/partition.h"
547#include "stringlib/split.h"
548#include "stringlib/count.h"
549#include "stringlib/find.h"
550#include "stringlib/find_max_char.h"
551#include "stringlib/localeutil.h"
552#include "stringlib/undef.h"
553
554#include "stringlib/ucs1lib.h"
555#include "stringlib/fastsearch.h"
556#include "stringlib/partition.h"
557#include "stringlib/split.h"
558#include "stringlib/count.h"
559#include "stringlib/find.h"
560#include "stringlib/find_max_char.h"
561#include "stringlib/localeutil.h"
562#include "stringlib/undef.h"
563
564#include "stringlib/ucs2lib.h"
565#include "stringlib/fastsearch.h"
566#include "stringlib/partition.h"
567#include "stringlib/split.h"
568#include "stringlib/count.h"
569#include "stringlib/find.h"
570#include "stringlib/find_max_char.h"
571#include "stringlib/localeutil.h"
572#include "stringlib/undef.h"
573
574#include "stringlib/ucs4lib.h"
575#include "stringlib/fastsearch.h"
576#include "stringlib/partition.h"
577#include "stringlib/split.h"
578#include "stringlib/count.h"
579#include "stringlib/find.h"
580#include "stringlib/find_max_char.h"
581#include "stringlib/localeutil.h"
582#include "stringlib/undef.h"
583
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200584#include "stringlib/unicodedefs.h"
585#include "stringlib/fastsearch.h"
586#include "stringlib/count.h"
587#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100588#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200589
Guido van Rossumd57fd912000-03-10 22:53:23 +0000590/* --- Unicode Object ----------------------------------------------------- */
591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200592static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200593fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200594
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200595Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
596 Py_ssize_t size, Py_UCS4 ch,
597 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200598{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200599 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
600
601 switch (kind) {
602 case PyUnicode_1BYTE_KIND:
603 {
604 Py_UCS1 ch1 = (Py_UCS1) ch;
605 if (ch1 == ch)
606 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
607 else
608 return -1;
609 }
610 case PyUnicode_2BYTE_KIND:
611 {
612 Py_UCS2 ch2 = (Py_UCS2) ch;
613 if (ch2 == ch)
614 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
615 else
616 return -1;
617 }
618 case PyUnicode_4BYTE_KIND:
619 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
620 default:
621 assert(0);
622 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200624}
625
Victor Stinnerfe226c02011-10-03 03:52:20 +0200626static PyObject*
627resize_compact(PyObject *unicode, Py_ssize_t length)
628{
629 Py_ssize_t char_size;
630 Py_ssize_t struct_size;
631 Py_ssize_t new_size;
632 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100633 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200634
635 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200636 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200637 if (PyUnicode_IS_COMPACT_ASCII(unicode))
638 struct_size = sizeof(PyASCIIObject);
639 else
640 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200641 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200642
Victor Stinnerfe226c02011-10-03 03:52:20 +0200643 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
Victor Stinner84def372011-12-11 20:04:56 +0100644 Py_DECREF(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200645 PyErr_NoMemory();
646 return NULL;
647 }
648 new_size = (struct_size + (length + 1) * char_size);
649
Victor Stinner84def372011-12-11 20:04:56 +0100650 _Py_DEC_REFTOTAL;
651 _Py_ForgetReference(unicode);
652
653 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
654 if (new_unicode == NULL) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200655 PyObject_Del(unicode);
656 PyErr_NoMemory();
657 return NULL;
658 }
Victor Stinner84def372011-12-11 20:04:56 +0100659 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200660 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100661
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200663 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200664 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200665 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
666 _PyUnicode_WSTR_LENGTH(unicode) = length;
667 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200668 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
669 length, 0);
670 return unicode;
671}
672
Alexander Belopolsky40018472011-02-26 01:02:56 +0000673static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200674resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000675{
Victor Stinner95663112011-10-04 01:03:50 +0200676 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000679
Victor Stinner95663112011-10-04 01:03:50 +0200680 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681
682 if (PyUnicode_IS_READY(unicode)) {
683 Py_ssize_t char_size;
684 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200685 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200686 void *data;
687
688 data = _PyUnicode_DATA_ANY(unicode);
689 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200690 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200691 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
692 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200693 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
694 {
695 PyObject_DEL(_PyUnicode_UTF8(unicode));
696 _PyUnicode_UTF8(unicode) = NULL;
697 _PyUnicode_UTF8_LENGTH(unicode) = 0;
698 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200699
700 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
701 PyErr_NoMemory();
702 return -1;
703 }
704 new_size = (length + 1) * char_size;
705
706 data = (PyObject *)PyObject_REALLOC(data, new_size);
707 if (data == NULL) {
708 PyErr_NoMemory();
709 return -1;
710 }
711 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200712 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200713 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200714 _PyUnicode_WSTR_LENGTH(unicode) = length;
715 }
716 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200717 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200718 _PyUnicode_UTF8_LENGTH(unicode) = length;
719 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200720 _PyUnicode_LENGTH(unicode) = length;
721 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200722 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200723 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200724 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200726 }
Victor Stinner95663112011-10-04 01:03:50 +0200727 assert(_PyUnicode_WSTR(unicode) != NULL);
728
729 /* check for integer overflow */
730 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
731 PyErr_NoMemory();
732 return -1;
733 }
734 wstr = _PyUnicode_WSTR(unicode);
735 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
736 if (!wstr) {
737 PyErr_NoMemory();
738 return -1;
739 }
740 _PyUnicode_WSTR(unicode) = wstr;
741 _PyUnicode_WSTR(unicode)[length] = 0;
742 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200743 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000744 return 0;
745}
746
Victor Stinnerfe226c02011-10-03 03:52:20 +0200747static PyObject*
748resize_copy(PyObject *unicode, Py_ssize_t length)
749{
750 Py_ssize_t copy_length;
751 if (PyUnicode_IS_COMPACT(unicode)) {
752 PyObject *copy;
753 assert(PyUnicode_IS_READY(unicode));
754
755 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
756 if (copy == NULL)
757 return NULL;
758
759 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200760 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200761 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200762 }
763 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200764 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 assert(_PyUnicode_WSTR(unicode) != NULL);
766 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200767 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200768 if (w == NULL)
769 return NULL;
770 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
771 copy_length = Py_MIN(copy_length, length);
772 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
773 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200774 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200775 }
776}
777
Guido van Rossumd57fd912000-03-10 22:53:23 +0000778/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000779 Ux0000 terminated; some code (e.g. new_identifier)
780 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000781
782 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000783 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000784
785*/
786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200787#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200788static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200789#endif
790
Alexander Belopolsky40018472011-02-26 01:02:56 +0000791static PyUnicodeObject *
792_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000793{
794 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200795 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000796
Thomas Wouters477c8d52006-05-27 19:21:47 +0000797 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000798 if (length == 0 && unicode_empty != NULL) {
799 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200800 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000801 }
802
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000803 /* Ensure we won't overflow the size. */
804 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
805 return (PyUnicodeObject *)PyErr_NoMemory();
806 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200807 if (length < 0) {
808 PyErr_SetString(PyExc_SystemError,
809 "Negative size passed to _PyUnicode_New");
810 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811 }
812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200813#ifdef Py_DEBUG
814 ++unicode_old_new_calls;
815#endif
816
817 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
818 if (unicode == NULL)
819 return NULL;
820 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
821 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
822 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000823 PyErr_NoMemory();
824 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000825 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826
Jeremy Hyltond8082792003-09-16 19:41:39 +0000827 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000828 * the caller fails before initializing str -- unicode_resize()
829 * reads str[0], and the Keep-Alive optimization can keep memory
830 * allocated for str alive across a call to unicode_dealloc(unicode).
831 * We don't want unicode_resize to read uninitialized memory in
832 * that case.
833 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200834 _PyUnicode_WSTR(unicode)[0] = 0;
835 _PyUnicode_WSTR(unicode)[length] = 0;
836 _PyUnicode_WSTR_LENGTH(unicode) = length;
837 _PyUnicode_HASH(unicode) = -1;
838 _PyUnicode_STATE(unicode).interned = 0;
839 _PyUnicode_STATE(unicode).kind = 0;
840 _PyUnicode_STATE(unicode).compact = 0;
841 _PyUnicode_STATE(unicode).ready = 0;
842 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200843 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200844 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200845 _PyUnicode_UTF8(unicode) = NULL;
846 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100847 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000848 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000849
Benjamin Peterson29060642009-01-31 22:14:21 +0000850 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000851 /* XXX UNREF/NEWREF interface should be more symmetrical */
852 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000853 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000854 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000855 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000856}
857
Victor Stinnerf42dc442011-10-02 23:33:16 +0200858static const char*
859unicode_kind_name(PyObject *unicode)
860{
Victor Stinner42dfd712011-10-03 14:41:45 +0200861 /* don't check consistency: unicode_kind_name() is called from
862 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200863 if (!PyUnicode_IS_COMPACT(unicode))
864 {
865 if (!PyUnicode_IS_READY(unicode))
866 return "wstr";
867 switch(PyUnicode_KIND(unicode))
868 {
869 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200870 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200871 return "legacy ascii";
872 else
873 return "legacy latin1";
874 case PyUnicode_2BYTE_KIND:
875 return "legacy UCS2";
876 case PyUnicode_4BYTE_KIND:
877 return "legacy UCS4";
878 default:
879 return "<legacy invalid kind>";
880 }
881 }
882 assert(PyUnicode_IS_READY(unicode));
883 switch(PyUnicode_KIND(unicode))
884 {
885 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200886 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200887 return "ascii";
888 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200889 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200890 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200891 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 default:
895 return "<invalid compact kind>";
896 }
897}
898
#ifdef Py_DEBUG
/* Counts the calls to PyUnicode_New() that allocate a new object. */
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Return PyUnicode_UTF8(unicode). */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Return _PyUnicode_COMPACT_DATA(unicode). */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the object address, its compact flags and the candidate data
   addresses to stdout, then return PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line description of the string `op` to stdout: kind name,
   length, the wstr pointer, the utf8 pointer (unless the string is compact
   ASCII) and the data pointer.  A pointer equal to the data pointer is
   flagged as "shared". */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* Locate the character data without going through the checking
       macros: right after the header for compact strings, through the
       data pointer otherwise. */
    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;

    /* The length fields are signed (Py_ssize_t), hence %zd rather than
       %zu; %p requires a void pointer, hence the casts. */
    printf("%s: len=%zd, ", unicode_kind_name(op), ascii->length);

    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", (void *)ascii->wstr);

    /* wstr_length, utf8 and utf8_length are read through the
       PyCompactUnicodeObject view, so skip them for compact ASCII. */
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
952
953PyObject *
954PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
955{
956 PyObject *obj;
957 PyCompactUnicodeObject *unicode;
958 void *data;
959 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200960 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200961 Py_ssize_t char_size;
962 Py_ssize_t struct_size;
963
964 /* Optimization for empty strings */
965 if (size == 0 && unicode_empty != NULL) {
966 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200967 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 }
969
970#ifdef Py_DEBUG
971 ++unicode_new_new_calls;
972#endif
973
Victor Stinner9e9d6892011-10-04 01:02:02 +0200974 is_ascii = 0;
975 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976 struct_size = sizeof(PyCompactUnicodeObject);
977 if (maxchar < 128) {
978 kind_state = PyUnicode_1BYTE_KIND;
979 char_size = 1;
980 is_ascii = 1;
981 struct_size = sizeof(PyASCIIObject);
982 }
983 else if (maxchar < 256) {
984 kind_state = PyUnicode_1BYTE_KIND;
985 char_size = 1;
986 }
987 else if (maxchar < 65536) {
988 kind_state = PyUnicode_2BYTE_KIND;
989 char_size = 2;
990 if (sizeof(wchar_t) == 2)
991 is_sharing = 1;
992 }
993 else {
994 kind_state = PyUnicode_4BYTE_KIND;
995 char_size = 4;
996 if (sizeof(wchar_t) == 4)
997 is_sharing = 1;
998 }
999
1000 /* Ensure we won't overflow the size. */
1001 if (size < 0) {
1002 PyErr_SetString(PyExc_SystemError,
1003 "Negative size passed to PyUnicode_New");
1004 return NULL;
1005 }
1006 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1007 return PyErr_NoMemory();
1008
1009 /* Duplicated allocation code from _PyObject_New() instead of a call to
1010 * PyObject_New() so we are able to allocate space for the object and
1011 * it's data buffer.
1012 */
1013 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1014 if (obj == NULL)
1015 return PyErr_NoMemory();
1016 obj = PyObject_INIT(obj, &PyUnicode_Type);
1017 if (obj == NULL)
1018 return NULL;
1019
1020 unicode = (PyCompactUnicodeObject *)obj;
1021 if (is_ascii)
1022 data = ((PyASCIIObject*)obj) + 1;
1023 else
1024 data = unicode + 1;
1025 _PyUnicode_LENGTH(unicode) = size;
1026 _PyUnicode_HASH(unicode) = -1;
1027 _PyUnicode_STATE(unicode).interned = 0;
1028 _PyUnicode_STATE(unicode).kind = kind_state;
1029 _PyUnicode_STATE(unicode).compact = 1;
1030 _PyUnicode_STATE(unicode).ready = 1;
1031 _PyUnicode_STATE(unicode).ascii = is_ascii;
1032 if (is_ascii) {
1033 ((char*)data)[size] = 0;
1034 _PyUnicode_WSTR(unicode) = NULL;
1035 }
1036 else if (kind_state == PyUnicode_1BYTE_KIND) {
1037 ((char*)data)[size] = 0;
1038 _PyUnicode_WSTR(unicode) = NULL;
1039 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001040 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001041 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001042 }
1043 else {
1044 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001045 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001046 if (kind_state == PyUnicode_2BYTE_KIND)
1047 ((Py_UCS2*)data)[size] = 0;
1048 else /* kind_state == PyUnicode_4BYTE_KIND */
1049 ((Py_UCS4*)data)[size] = 0;
1050 if (is_sharing) {
1051 _PyUnicode_WSTR_LENGTH(unicode) = size;
1052 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1053 }
1054 else {
1055 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1056 _PyUnicode_WSTR(unicode) = NULL;
1057 }
1058 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001059 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 return obj;
1061}
1062
#if SIZEOF_WCHAR_T == 2
/* Helper converting a 16-bit wchar_t buffer [begin, end) to UCS4, writing
   the result into the 4-byte data of `unicode`.  A high surrogate directly
   followed by a low surrogate is joined into a single code point; any other
   code unit, lone surrogates included, is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* There must still be room for the next code point. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
            continue;
        }
        *dst++ = *src++;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1102
Victor Stinnercd9950f2011-10-02 00:34:53 +02001103static int
1104_PyUnicode_Dirty(PyObject *unicode)
1105{
Victor Stinner910337b2011-10-03 03:20:16 +02001106 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001107 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001108 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001109 "Cannot modify a string having more than 1 reference");
1110 return -1;
1111 }
1112 _PyUnicode_DIRTY(unicode);
1113 return 0;
1114}
1115
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001116static int
1117_copy_characters(PyObject *to, Py_ssize_t to_start,
1118 PyObject *from, Py_ssize_t from_start,
1119 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001121 unsigned int from_kind, to_kind;
1122 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001123 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001124
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001125 assert(PyUnicode_Check(from));
1126 assert(PyUnicode_Check(to));
1127 assert(PyUnicode_IS_READY(from));
1128 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1131 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1132 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001133
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001134 if (how_many == 0)
1135 return 0;
1136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001137 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001138 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001139 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001140 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001142#ifdef Py_DEBUG
1143 if (!check_maxchar
1144 && (from_kind > to_kind
1145 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001146 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1148 Py_UCS4 ch;
1149 Py_ssize_t i;
1150 for (i=0; i < how_many; i++) {
1151 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1152 assert(ch <= to_maxchar);
1153 }
1154 }
1155#endif
1156 fast = (from_kind == to_kind);
1157 if (check_maxchar
1158 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1159 {
1160 /* deny latin1 => ascii */
1161 fast = 0;
1162 }
1163
1164 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001165 Py_MEMCPY((char*)to_data + to_kind * to_start,
1166 (char*)from_data + from_kind * from_start,
1167 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001168 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001169 else if (from_kind == PyUnicode_1BYTE_KIND
1170 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001171 {
1172 _PyUnicode_CONVERT_BYTES(
1173 Py_UCS1, Py_UCS2,
1174 PyUnicode_1BYTE_DATA(from) + from_start,
1175 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1176 PyUnicode_2BYTE_DATA(to) + to_start
1177 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001178 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001179 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001180 && to_kind == PyUnicode_4BYTE_KIND)
1181 {
1182 _PyUnicode_CONVERT_BYTES(
1183 Py_UCS1, Py_UCS4,
1184 PyUnicode_1BYTE_DATA(from) + from_start,
1185 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1186 PyUnicode_4BYTE_DATA(to) + to_start
1187 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001188 }
1189 else if (from_kind == PyUnicode_2BYTE_KIND
1190 && to_kind == PyUnicode_4BYTE_KIND)
1191 {
1192 _PyUnicode_CONVERT_BYTES(
1193 Py_UCS2, Py_UCS4,
1194 PyUnicode_2BYTE_DATA(from) + from_start,
1195 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1196 PyUnicode_4BYTE_DATA(to) + to_start
1197 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001198 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001199 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001200 /* check if max_char(from substring) <= max_char(to) */
1201 if (from_kind > to_kind
1202 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001203 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001204 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001205 /* slow path to check for character overflow */
1206 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001207 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001208 Py_ssize_t i;
1209
Victor Stinner56c161a2011-10-06 02:47:11 +02001210#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001211 for (i=0; i < how_many; i++) {
1212 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001213 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001214 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1215 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001216#else
1217 if (!check_maxchar) {
1218 for (i=0; i < how_many; i++) {
1219 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1220 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1221 }
1222 }
1223 else {
1224 for (i=0; i < how_many; i++) {
1225 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1226 if (ch > to_maxchar)
1227 return 1;
1228 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1229 }
1230 }
1231#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001232 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001233 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001234 assert(0 && "inconsistent state");
1235 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001236 }
1237 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001238 return 0;
1239}
1240
1241static void
1242copy_characters(PyObject *to, Py_ssize_t to_start,
1243 PyObject *from, Py_ssize_t from_start,
1244 Py_ssize_t how_many)
1245{
1246 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1247}
1248
1249Py_ssize_t
1250PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1251 PyObject *from, Py_ssize_t from_start,
1252 Py_ssize_t how_many)
1253{
1254 int err;
1255
1256 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1257 PyErr_BadInternalCall();
1258 return -1;
1259 }
1260
1261 if (PyUnicode_READY(from))
1262 return -1;
1263 if (PyUnicode_READY(to))
1264 return -1;
1265
1266 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1267 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1268 PyErr_Format(PyExc_SystemError,
1269 "Cannot write %zi characters at %zi "
1270 "in a string of %zi characters",
1271 how_many, to_start, PyUnicode_GET_LENGTH(to));
1272 return -1;
1273 }
1274
1275 if (how_many == 0)
1276 return 0;
1277
1278 if (_PyUnicode_Dirty(to))
1279 return -1;
1280
1281 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1282 if (err) {
1283 PyErr_Format(PyExc_SystemError,
1284 "Cannot copy %s characters "
1285 "into a string of %s characters",
1286 unicode_kind_name(from),
1287 unicode_kind_name(to));
1288 return -1;
1289 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001290 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001291}
1292
Victor Stinner17222162011-09-28 22:15:37 +02001293/* Find the maximum code point and count the number of surrogate pairs so a
1294 correct string length can be computed before converting a string to UCS4.
1295 This function counts single surrogates as a character and not as a pair.
1296
1297 Return 0 on success, or -1 on error. */
1298static int
1299find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1300 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001301{
1302 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001303 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304
Victor Stinnerc53be962011-10-02 21:33:54 +02001305 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306 *num_surrogates = 0;
1307 *maxchar = 0;
1308
1309 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001311 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1312 && (iter+1) < end
1313 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001315 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 iter += 2;
1318 }
1319 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001321 {
1322 ch = *iter;
1323 iter++;
1324 }
1325 if (ch > *maxchar) {
1326 *maxchar = ch;
1327 if (*maxchar > MAX_UNICODE) {
1328 PyErr_Format(PyExc_ValueError,
1329 "character U+%x is not in range [U+0000; U+10ffff]",
1330 ch);
1331 return -1;
1332 }
1333 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334 }
1335 return 0;
1336}
1337
1338#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001339static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001340#endif
1341
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001342int
1343_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344{
1345 wchar_t *end;
1346 Py_UCS4 maxchar = 0;
1347 Py_ssize_t num_surrogates;
1348#if SIZEOF_WCHAR_T == 2
1349 Py_ssize_t length_wo_surrogates;
1350#endif
1351
Georg Brandl7597add2011-10-05 16:36:47 +02001352 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001353 strings were created using _PyObject_New() and where no canonical
1354 representation (the str field) has been set yet aka strings
1355 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001356 assert(_PyUnicode_CHECK(unicode));
1357 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001358 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001359 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001360 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001361 /* Actually, it should neither be interned nor be anything else: */
1362 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363
1364#ifdef Py_DEBUG
1365 ++unicode_ready_calls;
1366#endif
1367
1368 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001369 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001370 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372
1373 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001374 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1375 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376 PyErr_NoMemory();
1377 return -1;
1378 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001379 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001380 _PyUnicode_WSTR(unicode), end,
1381 PyUnicode_1BYTE_DATA(unicode));
1382 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1383 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1384 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1385 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001386 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001387 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001388 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389 }
1390 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001391 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001392 _PyUnicode_UTF8(unicode) = NULL;
1393 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 }
1395 PyObject_FREE(_PyUnicode_WSTR(unicode));
1396 _PyUnicode_WSTR(unicode) = NULL;
1397 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1398 }
1399 /* In this case we might have to convert down from 4-byte native
1400 wchar_t to 2-byte unicode. */
1401 else if (maxchar < 65536) {
1402 assert(num_surrogates == 0 &&
1403 "FindMaxCharAndNumSurrogatePairs() messed up");
1404
Victor Stinner506f5922011-09-28 22:34:18 +02001405#if SIZEOF_WCHAR_T == 2
1406 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001407 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001408 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1409 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1410 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001411 _PyUnicode_UTF8(unicode) = NULL;
1412 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001413#else
1414 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001415 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001416 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001417 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001418 PyErr_NoMemory();
1419 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420 }
Victor Stinner506f5922011-09-28 22:34:18 +02001421 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1422 _PyUnicode_WSTR(unicode), end,
1423 PyUnicode_2BYTE_DATA(unicode));
1424 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1425 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1426 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001427 _PyUnicode_UTF8(unicode) = NULL;
1428 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001429 PyObject_FREE(_PyUnicode_WSTR(unicode));
1430 _PyUnicode_WSTR(unicode) = NULL;
1431 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1432#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 }
1434 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1435 else {
1436#if SIZEOF_WCHAR_T == 2
1437 /* in case the native representation is 2-bytes, we need to allocate a
1438 new normalized 4-byte version. */
1439 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001440 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1441 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001442 PyErr_NoMemory();
1443 return -1;
1444 }
1445 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1446 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001447 _PyUnicode_UTF8(unicode) = NULL;
1448 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001449 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1450 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001451 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452 PyObject_FREE(_PyUnicode_WSTR(unicode));
1453 _PyUnicode_WSTR(unicode) = NULL;
1454 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1455#else
1456 assert(num_surrogates == 0);
1457
Victor Stinnerc3c74152011-10-02 20:39:55 +02001458 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001459 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001460 _PyUnicode_UTF8(unicode) = NULL;
1461 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1463#endif
1464 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1465 }
1466 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001467 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 return 0;
1469}
1470
Alexander Belopolsky40018472011-02-26 01:02:56 +00001471static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001472unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001473{
Walter Dörwald16807132007-05-25 13:52:07 +00001474 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001475 case SSTATE_NOT_INTERNED:
1476 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001477
Benjamin Peterson29060642009-01-31 22:14:21 +00001478 case SSTATE_INTERNED_MORTAL:
1479 /* revive dead object temporarily for DelItem */
1480 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001481 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001482 Py_FatalError(
1483 "deletion of interned string failed");
1484 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001485
Benjamin Peterson29060642009-01-31 22:14:21 +00001486 case SSTATE_INTERNED_IMMORTAL:
1487 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001488
Benjamin Peterson29060642009-01-31 22:14:21 +00001489 default:
1490 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001491 }
1492
Victor Stinner03490912011-10-03 23:45:12 +02001493 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001494 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001495 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001496 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497
1498 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001499 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500 }
1501 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001502 if (_PyUnicode_DATA_ANY(unicode))
1503 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001504 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505 }
1506}
1507
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character strings in unicode_latin1[], 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    const PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;
    /* Only a non-wchar-kind string of length one can be a cached character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;
    first = PyUnicode_READ_CHAR(unicode, 0);
    return first < 256 && unicode_latin1[first] == unicode;
}
#endif
1524
Alexander Belopolsky40018472011-02-26 01:02:56 +00001525static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001526unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001527{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001528 if (Py_REFCNT(unicode) != 1)
1529 return 0;
1530 if (PyUnicode_CHECK_INTERNED(unicode))
1531 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001532#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001533 /* singleton refcount is greater than 1 */
1534 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001535#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001536 return 1;
1537}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001538
Victor Stinnerfe226c02011-10-03 03:52:20 +02001539static int
1540unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1541{
1542 PyObject *unicode;
1543 Py_ssize_t old_length;
1544
1545 assert(p_unicode != NULL);
1546 unicode = *p_unicode;
1547
1548 assert(unicode != NULL);
1549 assert(PyUnicode_Check(unicode));
1550 assert(0 <= length);
1551
Victor Stinner910337b2011-10-03 03:20:16 +02001552 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001553 old_length = PyUnicode_WSTR_LENGTH(unicode);
1554 else
1555 old_length = PyUnicode_GET_LENGTH(unicode);
1556 if (old_length == length)
1557 return 0;
1558
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001559 if (length == 0) {
1560 Py_DECREF(*p_unicode);
1561 *p_unicode = unicode_empty;
1562 Py_INCREF(*p_unicode);
1563 return 0;
1564 }
1565
Victor Stinnerfe226c02011-10-03 03:52:20 +02001566 if (!unicode_resizable(unicode)) {
1567 PyObject *copy = resize_copy(unicode, length);
1568 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001569 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001570 Py_DECREF(*p_unicode);
1571 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001572 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001573 }
1574
Victor Stinnerfe226c02011-10-03 03:52:20 +02001575 if (PyUnicode_IS_COMPACT(unicode)) {
1576 *p_unicode = resize_compact(unicode, length);
1577 if (*p_unicode == NULL)
1578 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001579 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001580 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001581 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001582 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001583}
1584
Alexander Belopolsky40018472011-02-26 01:02:56 +00001585int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001586PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001587{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001588 PyObject *unicode;
1589 if (p_unicode == NULL) {
1590 PyErr_BadInternalCall();
1591 return -1;
1592 }
1593 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001594 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001595 {
1596 PyErr_BadInternalCall();
1597 return -1;
1598 }
1599 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001600}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001601
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001602static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001603unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001604{
1605 PyObject *result;
1606 assert(PyUnicode_IS_READY(*p_unicode));
1607 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1608 return 0;
1609 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1610 maxchar);
1611 if (result == NULL)
1612 return -1;
1613 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1614 PyUnicode_GET_LENGTH(*p_unicode));
1615 Py_DECREF(*p_unicode);
1616 *p_unicode = result;
1617 return 0;
1618}
1619
1620static int
1621unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1622 Py_UCS4 ch)
1623{
1624 if (unicode_widen(p_unicode, ch) < 0)
1625 return -1;
1626 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1627 PyUnicode_DATA(*p_unicode),
1628 (*pos)++, ch);
1629 return 0;
1630}
1631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632static PyObject*
1633get_latin1_char(unsigned char ch)
1634{
Victor Stinnera464fc12011-10-02 20:39:30 +02001635 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001637 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638 if (!unicode)
1639 return NULL;
1640 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001641 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 unicode_latin1[ch] = unicode;
1643 }
1644 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001645 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646}
1647
Alexander Belopolsky40018472011-02-26 01:02:56 +00001648PyObject *
1649PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001650{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001651 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652 Py_UCS4 maxchar = 0;
1653 Py_ssize_t num_surrogates;
1654
1655 if (u == NULL)
1656 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001658 /* If the Unicode data is known at construction time, we can apply
1659 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661 /* Optimization for empty strings */
1662 if (size == 0 && unicode_empty != NULL) {
1663 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001664 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001665 }
Tim Petersced69f82003-09-16 20:30:58 +00001666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 /* Single character Unicode objects in the Latin-1 range are
1668 shared when using this constructor */
1669 if (size == 1 && *u < 256)
1670 return get_latin1_char((unsigned char)*u);
1671
1672 /* If not empty and not single character, copy the Unicode data
1673 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001674 if (find_maxchar_surrogates(u, u + size,
1675 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676 return NULL;
1677
Victor Stinner8faf8212011-12-08 22:14:11 +01001678 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679 if (!unicode)
1680 return NULL;
1681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682 switch (PyUnicode_KIND(unicode)) {
1683 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001684 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1686 break;
1687 case PyUnicode_2BYTE_KIND:
1688#if Py_UNICODE_SIZE == 2
1689 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1690#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001691 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001692 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1693#endif
1694 break;
1695 case PyUnicode_4BYTE_KIND:
1696#if SIZEOF_WCHAR_T == 2
1697 /* This is the only case which has to process surrogates, thus
1698 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001699 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700#else
1701 assert(num_surrogates == 0);
1702 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1703#endif
1704 break;
1705 default:
1706 assert(0 && "Impossible state");
1707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001709 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710}
1711
Alexander Belopolsky40018472011-02-26 01:02:56 +00001712PyObject *
1713PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001714{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001715 if (size < 0) {
1716 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001717 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001718 return NULL;
1719 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001720
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001721 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001722 some optimizations which share commonly used objects.
1723 Also, this means the input must be UTF-8, so fall back to the
1724 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001725 if (u != NULL) {
1726
Benjamin Peterson29060642009-01-31 22:14:21 +00001727 /* Optimization for empty strings */
1728 if (size == 0 && unicode_empty != NULL) {
1729 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001730 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001731 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001732
1733 /* Single characters are shared when using this constructor.
1734 Restrict to ASCII, since the input must be UTF-8. */
Victor Stinner9faa3842011-10-23 20:06:00 +02001735 if (size == 1 && (unsigned char)*u < 128)
1736 return get_latin1_char((unsigned char)*u);
Martin v. Löwis9c121062007-08-05 20:26:11 +00001737
1738 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001739 }
1740
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001741 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001742}
1743
Alexander Belopolsky40018472011-02-26 01:02:56 +00001744PyObject *
1745PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001746{
1747 size_t size = strlen(u);
1748 if (size > PY_SSIZE_T_MAX) {
1749 PyErr_SetString(PyExc_OverflowError, "input too long");
1750 return NULL;
1751 }
1752
1753 return PyUnicode_FromStringAndSize(u, size);
1754}
1755
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001756PyObject *
1757_PyUnicode_FromId(_Py_Identifier *id)
1758{
1759 if (!id->object) {
1760 id->object = PyUnicode_FromString(id->string);
1761 if (!id->object)
1762 return NULL;
1763 PyUnicode_InternInPlace(&id->object);
1764 assert(!id->next);
1765 id->next = static_strings;
1766 static_strings = id;
1767 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001768 return id->object;
1769}
1770
1771void
1772_PyUnicode_ClearStaticStrings()
1773{
1774 _Py_Identifier *i;
1775 for (i = static_strings; i; i = i->next) {
1776 Py_DECREF(i->object);
1777 i->object = NULL;
1778 i->next = NULL;
1779 }
1780}
1781
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001782/* Internal function, don't check maximum character */
1783
Victor Stinnere57b1c02011-09-28 22:20:48 +02001784static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001785unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001786{
Victor Stinner785938e2011-12-11 20:09:03 +01001787 PyObject *unicode;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001788#ifdef Py_DEBUG
1789 const unsigned char *p;
1790 const unsigned char *end = s + size;
1791 for (p=s; p < end; p++) {
1792 assert(*p < 128);
1793 }
1794#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001795 if (size == 1)
1796 return get_latin1_char(s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01001797 unicode = PyUnicode_New(size, 127);
1798 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001799 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001800 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1801 assert(_PyUnicode_CheckConsistency(unicode, 1));
1802 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001803}
1804
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001805static Py_UCS4
1806kind_maxchar_limit(unsigned int kind)
1807{
1808 switch(kind) {
1809 case PyUnicode_1BYTE_KIND:
1810 return 0x80;
1811 case PyUnicode_2BYTE_KIND:
1812 return 0x100;
1813 case PyUnicode_4BYTE_KIND:
1814 return 0x10000;
1815 default:
1816 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001817 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001818 }
1819}
1820
Victor Stinner702c7342011-10-05 13:50:52 +02001821static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001822_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001823{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001824 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001825 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001826
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001827 if (size == 0) {
1828 Py_INCREF(unicode_empty);
1829 return unicode_empty;
1830 }
1831 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001832 if (size == 1)
1833 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001834
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001835 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001836 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001837 if (!res)
1838 return NULL;
1839 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001840 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001842}
1843
Victor Stinnere57b1c02011-09-28 22:20:48 +02001844static PyObject*
1845_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846{
1847 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001848 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001849
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001850 if (size == 0) {
1851 Py_INCREF(unicode_empty);
1852 return unicode_empty;
1853 }
1854 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001855 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001856 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001857
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001858 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001859 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001860 if (!res)
1861 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001862 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001863 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001864 else {
1865 _PyUnicode_CONVERT_BYTES(
1866 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1867 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001868 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001869 return res;
1870}
1871
Victor Stinnere57b1c02011-09-28 22:20:48 +02001872static PyObject*
1873_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874{
1875 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001876 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001877
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001878 if (size == 0) {
1879 Py_INCREF(unicode_empty);
1880 return unicode_empty;
1881 }
1882 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001883 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001884 return get_latin1_char((unsigned char)u[0]);
1885
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001886 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001887 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001888 if (!res)
1889 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001890 if (max_char < 256)
1891 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1892 PyUnicode_1BYTE_DATA(res));
1893 else if (max_char < 0x10000)
1894 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1895 PyUnicode_2BYTE_DATA(res));
1896 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001897 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001898 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001899 return res;
1900}
1901
1902PyObject*
1903PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1904{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001905 if (size < 0) {
1906 PyErr_SetString(PyExc_ValueError, "size must be positive");
1907 return NULL;
1908 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909 switch(kind) {
1910 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001911 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001913 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001914 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001915 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001916 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001917 PyErr_SetString(PyExc_SystemError, "invalid kind");
1918 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001919 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001920}
1921
Victor Stinner25a4b292011-10-06 12:31:55 +02001922/* Ensure that a string uses the most efficient storage, if it is not the
1923 case: create a new string with of the right kind. Write NULL into *p_unicode
1924 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001925static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001926unicode_adjust_maxchar(PyObject **p_unicode)
1927{
1928 PyObject *unicode, *copy;
1929 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001930 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001931 unsigned int kind;
1932
1933 assert(p_unicode != NULL);
1934 unicode = *p_unicode;
1935 assert(PyUnicode_IS_READY(unicode));
1936 if (PyUnicode_IS_ASCII(unicode))
1937 return;
1938
1939 len = PyUnicode_GET_LENGTH(unicode);
1940 kind = PyUnicode_KIND(unicode);
1941 if (kind == PyUnicode_1BYTE_KIND) {
1942 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001943 max_char = ucs1lib_find_max_char(u, u + len);
1944 if (max_char >= 128)
1945 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001946 }
1947 else if (kind == PyUnicode_2BYTE_KIND) {
1948 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001949 max_char = ucs2lib_find_max_char(u, u + len);
1950 if (max_char >= 256)
1951 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001952 }
1953 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001954 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001955 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001956 max_char = ucs4lib_find_max_char(u, u + len);
1957 if (max_char >= 0x10000)
1958 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001959 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001960 copy = PyUnicode_New(len, max_char);
1961 copy_characters(copy, 0, unicode, 0, len);
1962 Py_DECREF(unicode);
1963 *p_unicode = copy;
1964}
1965
Victor Stinner034f6cf2011-09-30 02:26:44 +02001966PyObject*
1967PyUnicode_Copy(PyObject *unicode)
1968{
Victor Stinner87af4f22011-11-21 23:03:47 +01001969 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001970 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001971
Victor Stinner034f6cf2011-09-30 02:26:44 +02001972 if (!PyUnicode_Check(unicode)) {
1973 PyErr_BadInternalCall();
1974 return NULL;
1975 }
1976 if (PyUnicode_READY(unicode))
1977 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001978
Victor Stinner87af4f22011-11-21 23:03:47 +01001979 length = PyUnicode_GET_LENGTH(unicode);
1980 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001981 if (!copy)
1982 return NULL;
1983 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1984
Victor Stinner87af4f22011-11-21 23:03:47 +01001985 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1986 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001987 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001988 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001989}
1990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001991
Victor Stinnerbc603d12011-10-02 01:00:40 +02001992/* Widen Unicode objects to larger buffers. Don't write terminating null
1993 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994
1995void*
1996_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1997{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001998 Py_ssize_t len;
1999 void *result;
2000 unsigned int skind;
2001
2002 if (PyUnicode_READY(s))
2003 return NULL;
2004
2005 len = PyUnicode_GET_LENGTH(s);
2006 skind = PyUnicode_KIND(s);
2007 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002008 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 return NULL;
2010 }
2011 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002012 case PyUnicode_2BYTE_KIND:
2013 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2014 if (!result)
2015 return PyErr_NoMemory();
2016 assert(skind == PyUnicode_1BYTE_KIND);
2017 _PyUnicode_CONVERT_BYTES(
2018 Py_UCS1, Py_UCS2,
2019 PyUnicode_1BYTE_DATA(s),
2020 PyUnicode_1BYTE_DATA(s) + len,
2021 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002023 case PyUnicode_4BYTE_KIND:
2024 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2025 if (!result)
2026 return PyErr_NoMemory();
2027 if (skind == PyUnicode_2BYTE_KIND) {
2028 _PyUnicode_CONVERT_BYTES(
2029 Py_UCS2, Py_UCS4,
2030 PyUnicode_2BYTE_DATA(s),
2031 PyUnicode_2BYTE_DATA(s) + len,
2032 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002034 else {
2035 assert(skind == PyUnicode_1BYTE_KIND);
2036 _PyUnicode_CONVERT_BYTES(
2037 Py_UCS1, Py_UCS4,
2038 PyUnicode_1BYTE_DATA(s),
2039 PyUnicode_1BYTE_DATA(s) + len,
2040 result);
2041 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002043 default:
2044 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002045 }
Victor Stinner01698042011-10-04 00:04:26 +02002046 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 return NULL;
2048}
2049
2050static Py_UCS4*
2051as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2052 int copy_null)
2053{
2054 int kind;
2055 void *data;
2056 Py_ssize_t len, targetlen;
2057 if (PyUnicode_READY(string) == -1)
2058 return NULL;
2059 kind = PyUnicode_KIND(string);
2060 data = PyUnicode_DATA(string);
2061 len = PyUnicode_GET_LENGTH(string);
2062 targetlen = len;
2063 if (copy_null)
2064 targetlen++;
2065 if (!target) {
2066 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2067 PyErr_NoMemory();
2068 return NULL;
2069 }
2070 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2071 if (!target) {
2072 PyErr_NoMemory();
2073 return NULL;
2074 }
2075 }
2076 else {
2077 if (targetsize < targetlen) {
2078 PyErr_Format(PyExc_SystemError,
2079 "string is longer than the buffer");
2080 if (copy_null && 0 < targetsize)
2081 target[0] = 0;
2082 return NULL;
2083 }
2084 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002085 if (kind == PyUnicode_1BYTE_KIND) {
2086 Py_UCS1 *start = (Py_UCS1 *) data;
2087 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002088 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002089 else if (kind == PyUnicode_2BYTE_KIND) {
2090 Py_UCS2 *start = (Py_UCS2 *) data;
2091 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2092 }
2093 else {
2094 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002095 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002097 if (copy_null)
2098 target[len] = 0;
2099 return target;
2100}
2101
2102Py_UCS4*
2103PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2104 int copy_null)
2105{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002106 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002107 PyErr_BadInternalCall();
2108 return NULL;
2109 }
2110 return as_ucs4(string, target, targetsize, copy_null);
2111}
2112
2113Py_UCS4*
2114PyUnicode_AsUCS4Copy(PyObject *string)
2115{
2116 return as_ucs4(string, NULL, 0, 1);
2117}
2118
#ifdef HAVE_WCHAR_H

/* Create a unicode object from the wide-character buffer `w`.

   A size of -1 means the length is measured with wcslen(). With w == NULL
   a size of 0 yields a new empty string; any other size is reported
   through PyErr_BadInternalCall(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    Py_ssize_t nchars = size;

    if (w == NULL && size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (w == NULL)
        return PyUnicode_New(0, 0);

    if (nchars == -1)
        nchars = wcslen(w);

    return PyUnicode_FromUnicode(w, nchars);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002139
Walter Dörwald346737f2007-05-31 10:44:43 +00002140static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002141makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2142 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002143{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002144 *fmt++ = '%';
2145 if (width) {
2146 if (zeropad)
2147 *fmt++ = '0';
2148 fmt += sprintf(fmt, "%d", width);
2149 }
2150 if (precision)
2151 fmt += sprintf(fmt, ".%d", precision);
2152 if (longflag)
2153 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002154 else if (longlongflag) {
2155 /* longlongflag should only ever be nonzero on machines with
2156 HAVE_LONG_LONG defined */
2157#ifdef HAVE_LONG_LONG
2158 char *f = PY_FORMAT_LONG_LONG;
2159 while (*f)
2160 *fmt++ = *f++;
2161#else
2162 /* we shouldn't ever get here */
2163 assert(0);
2164 *fmt++ = 'l';
2165#endif
2166 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002167 else if (size_tflag) {
2168 char *f = PY_FORMAT_SIZE_T;
2169 while (*f)
2170 *fmt++ = *f++;
2171 }
2172 *fmt++ = c;
2173 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002174}
2175
/* Helper for PyUnicode_FromFormatV(): parse the "width.precision" part and
   the length modifier of one format specification.

   f must point at the '%' that starts the specification.  Each p_* output
   pointer may be NULL, in which case that value is not reported.  A width
   or precision that is absent is reported as 0.  The length flags are only
   set when the modifier is directly followed by 'd', 'u' or 'i'.

   Returns a pointer to the conversion character (or to the character the
   caller should treat as such for a malformed specification). */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* skip the '%', then read e.g. "2.5" of "%2.5s" */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => step back so that f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format such as "%.1" => step back so that f points to
           "1" and the caller's loop still sees the terminator next */
        f--;
    }

    /* Length modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2240
/* Upper bound on the number of characters "%ld" can produce: 21 is enough
   for a 64-bit integer printed in decimal plus an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters "%lld" can produce.  At most
   ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for the
   sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2248
Walter Dörwaldd2034312007-05-18 16:29:38 +00002249PyObject *
2250PyUnicode_FromFormatV(const char *format, va_list vargs)
2251{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002252 va_list count;
2253 Py_ssize_t callcount = 0;
2254 PyObject **callresults = NULL;
2255 PyObject **callresult = NULL;
2256 Py_ssize_t n = 0;
2257 int width = 0;
2258 int precision = 0;
2259 int zeropad;
2260 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002261 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002262 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002263 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2265 Py_UCS4 argmaxchar;
2266 Py_ssize_t numbersize = 0;
2267 char *numberresults = NULL;
2268 char *numberresult = NULL;
2269 Py_ssize_t i;
2270 int kind;
2271 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002272
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002273 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002274 /* step 1: count the number of %S/%R/%A/%s format specifications
2275 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2276 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002278 * also estimate a upper bound for all the number formats in the string,
2279 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002280 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002281 for (f = format; *f; f++) {
2282 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002283 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2285 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2286 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2287 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002289 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002290#ifdef HAVE_LONG_LONG
2291 if (longlongflag) {
2292 if (width < MAX_LONG_LONG_CHARS)
2293 width = MAX_LONG_LONG_CHARS;
2294 }
2295 else
2296#endif
2297 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2298 including sign. Decimal takes the most space. This
2299 isn't enough for octal. If a width is specified we
2300 need more (which we allocate later). */
2301 if (width < MAX_LONG_CHARS)
2302 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002303
2304 /* account for the size + '\0' to separate numbers
2305 inside of the numberresults buffer */
2306 numbersize += (width + 1);
2307 }
2308 }
2309 else if ((unsigned char)*f > 127) {
2310 PyErr_Format(PyExc_ValueError,
2311 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2312 "string, got a non-ASCII byte: 0x%02x",
2313 (unsigned char)*f);
2314 return NULL;
2315 }
2316 }
2317 /* step 2: allocate memory for the results of
2318 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2319 if (callcount) {
2320 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2321 if (!callresults) {
2322 PyErr_NoMemory();
2323 return NULL;
2324 }
2325 callresult = callresults;
2326 }
2327 /* step 2.5: allocate memory for the results of formating numbers */
2328 if (numbersize) {
2329 numberresults = PyObject_Malloc(numbersize);
2330 if (!numberresults) {
2331 PyErr_NoMemory();
2332 goto fail;
2333 }
2334 numberresult = numberresults;
2335 }
2336
2337 /* step 3: format numbers and figure out how large a buffer we need */
2338 for (f = format; *f; f++) {
2339 if (*f == '%') {
2340 const char* p;
2341 int longflag;
2342 int longlongflag;
2343 int size_tflag;
2344 int numprinted;
2345
2346 p = f;
2347 zeropad = (f[1] == '0');
2348 f = parse_format_flags(f, &width, &precision,
2349 &longflag, &longlongflag, &size_tflag);
2350 switch (*f) {
2351 case 'c':
2352 {
2353 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002354 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002355 n++;
2356 break;
2357 }
2358 case '%':
2359 n++;
2360 break;
2361 case 'i':
2362 case 'd':
2363 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2364 width, precision, *f);
2365 if (longflag)
2366 numprinted = sprintf(numberresult, fmt,
2367 va_arg(count, long));
2368#ifdef HAVE_LONG_LONG
2369 else if (longlongflag)
2370 numprinted = sprintf(numberresult, fmt,
2371 va_arg(count, PY_LONG_LONG));
2372#endif
2373 else if (size_tflag)
2374 numprinted = sprintf(numberresult, fmt,
2375 va_arg(count, Py_ssize_t));
2376 else
2377 numprinted = sprintf(numberresult, fmt,
2378 va_arg(count, int));
2379 n += numprinted;
2380 /* advance by +1 to skip over the '\0' */
2381 numberresult += (numprinted + 1);
2382 assert(*(numberresult - 1) == '\0');
2383 assert(*(numberresult - 2) != '\0');
2384 assert(numprinted >= 0);
2385 assert(numberresult <= numberresults + numbersize);
2386 break;
2387 case 'u':
2388 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2389 width, precision, 'u');
2390 if (longflag)
2391 numprinted = sprintf(numberresult, fmt,
2392 va_arg(count, unsigned long));
2393#ifdef HAVE_LONG_LONG
2394 else if (longlongflag)
2395 numprinted = sprintf(numberresult, fmt,
2396 va_arg(count, unsigned PY_LONG_LONG));
2397#endif
2398 else if (size_tflag)
2399 numprinted = sprintf(numberresult, fmt,
2400 va_arg(count, size_t));
2401 else
2402 numprinted = sprintf(numberresult, fmt,
2403 va_arg(count, unsigned int));
2404 n += numprinted;
2405 numberresult += (numprinted + 1);
2406 assert(*(numberresult - 1) == '\0');
2407 assert(*(numberresult - 2) != '\0');
2408 assert(numprinted >= 0);
2409 assert(numberresult <= numberresults + numbersize);
2410 break;
2411 case 'x':
2412 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2413 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2414 n += numprinted;
2415 numberresult += (numprinted + 1);
2416 assert(*(numberresult - 1) == '\0');
2417 assert(*(numberresult - 2) != '\0');
2418 assert(numprinted >= 0);
2419 assert(numberresult <= numberresults + numbersize);
2420 break;
2421 case 'p':
2422 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2423 /* %p is ill-defined: ensure leading 0x. */
2424 if (numberresult[1] == 'X')
2425 numberresult[1] = 'x';
2426 else if (numberresult[1] != 'x') {
2427 memmove(numberresult + 2, numberresult,
2428 strlen(numberresult) + 1);
2429 numberresult[0] = '0';
2430 numberresult[1] = 'x';
2431 numprinted += 2;
2432 }
2433 n += numprinted;
2434 numberresult += (numprinted + 1);
2435 assert(*(numberresult - 1) == '\0');
2436 assert(*(numberresult - 2) != '\0');
2437 assert(numprinted >= 0);
2438 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002439 break;
2440 case 's':
2441 {
2442 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002443 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002444 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2445 if (!str)
2446 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 /* since PyUnicode_DecodeUTF8 returns already flexible
2448 unicode objects, there is no need to call ready on them */
2449 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002450 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002452 /* Remember the str and switch to the next slot */
2453 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002454 break;
2455 }
2456 case 'U':
2457 {
2458 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002459 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002460 if (PyUnicode_READY(obj) == -1)
2461 goto fail;
2462 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002463 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002464 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002465 break;
2466 }
2467 case 'V':
2468 {
2469 PyObject *obj = va_arg(count, PyObject *);
2470 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002471 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002472 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002473 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002474 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002475 if (PyUnicode_READY(obj) == -1)
2476 goto fail;
2477 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002478 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002479 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002480 *callresult++ = NULL;
2481 }
2482 else {
2483 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2484 if (!str_obj)
2485 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002486 if (PyUnicode_READY(str_obj)) {
2487 Py_DECREF(str_obj);
2488 goto fail;
2489 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002490 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002491 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002492 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002493 *callresult++ = str_obj;
2494 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002495 break;
2496 }
2497 case 'S':
2498 {
2499 PyObject *obj = va_arg(count, PyObject *);
2500 PyObject *str;
2501 assert(obj);
2502 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002504 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002506 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002507 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002508 /* Remember the str and switch to the next slot */
2509 *callresult++ = str;
2510 break;
2511 }
2512 case 'R':
2513 {
2514 PyObject *obj = va_arg(count, PyObject *);
2515 PyObject *repr;
2516 assert(obj);
2517 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002518 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002519 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002520 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002521 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002522 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002523 /* Remember the repr and switch to the next slot */
2524 *callresult++ = repr;
2525 break;
2526 }
2527 case 'A':
2528 {
2529 PyObject *obj = va_arg(count, PyObject *);
2530 PyObject *ascii;
2531 assert(obj);
2532 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002533 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002534 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002535 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002536 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002537 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002538 /* Remember the repr and switch to the next slot */
2539 *callresult++ = ascii;
2540 break;
2541 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002542 default:
2543 /* if we stumble upon an unknown
2544 formatting code, copy the rest of
2545 the format string to the output
2546 string. (we cannot just skip the
2547 code, since there's no way to know
2548 what's in the argument list) */
2549 n += strlen(p);
2550 goto expand;
2551 }
2552 } else
2553 n++;
2554 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002555 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002556 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002557 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002558 we don't have to resize the string.
2559 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002560 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002561 if (!string)
2562 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002563 kind = PyUnicode_KIND(string);
2564 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002565 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002566 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002567
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002568 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002569 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002570 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002571
2572 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002573 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2574 /* checking for == because the last argument could be a empty
2575 string, which causes i to point to end, the assert at the end of
2576 the loop */
2577 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002578
Benjamin Peterson14339b62009-01-31 16:36:08 +00002579 switch (*f) {
2580 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002581 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002582 const int ordinal = va_arg(vargs, int);
2583 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002584 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002585 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002586 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002587 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002588 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590 case 'p':
2591 /* unused, since we already have the result */
2592 if (*f == 'p')
2593 (void) va_arg(vargs, void *);
2594 else
2595 (void) va_arg(vargs, int);
2596 /* extract the result from numberresults and append. */
2597 for (; *numberresult; ++i, ++numberresult)
2598 PyUnicode_WRITE(kind, data, i, *numberresult);
2599 /* skip over the separating '\0' */
2600 assert(*numberresult == '\0');
2601 numberresult++;
2602 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002603 break;
2604 case 's':
2605 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002606 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002607 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002608 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002609 size = PyUnicode_GET_LENGTH(*callresult);
2610 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002611 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002613 /* We're done with the unicode()/repr() => forget it */
2614 Py_DECREF(*callresult);
2615 /* switch to next unicode()/repr() result */
2616 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002617 break;
2618 }
2619 case 'U':
2620 {
2621 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002622 Py_ssize_t size;
2623 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2624 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002625 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002627 break;
2628 }
2629 case 'V':
2630 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002631 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002632 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002633 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002634 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 size = PyUnicode_GET_LENGTH(obj);
2636 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002637 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002638 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002639 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002640 size = PyUnicode_GET_LENGTH(*callresult);
2641 assert(PyUnicode_KIND(*callresult) <=
2642 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002643 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002644 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002645 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002646 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002647 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 break;
2649 }
2650 case 'S':
2651 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002652 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002653 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002654 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002655 /* unused, since we already have the result */
2656 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002657 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002658 copy_characters(string, i, *callresult, 0, size);
2659 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002660 /* We're done with the unicode()/repr() => forget it */
2661 Py_DECREF(*callresult);
2662 /* switch to next unicode()/repr() result */
2663 ++callresult;
2664 break;
2665 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002666 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002667 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 break;
2669 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002670 for (; *p; ++p, ++i)
2671 PyUnicode_WRITE(kind, data, i, *p);
2672 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002673 goto end;
2674 }
Victor Stinner1205f272010-09-11 00:54:47 +00002675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002676 else {
2677 assert(i < PyUnicode_GET_LENGTH(string));
2678 PyUnicode_WRITE(kind, data, i++, *f);
2679 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002681 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002682
Benjamin Peterson29060642009-01-31 22:14:21 +00002683 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002684 if (callresults)
2685 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002686 if (numberresults)
2687 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002688 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002689 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002690 if (callresults) {
2691 PyObject **callresult2 = callresults;
2692 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002693 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 ++callresult2;
2695 }
2696 PyObject_Free(callresults);
2697 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002698 if (numberresults)
2699 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002700 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002701}
2702
Walter Dörwaldd2034312007-05-18 16:29:38 +00002703PyObject *
2704PyUnicode_FromFormat(const char *format, ...)
2705{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002706 PyObject* ret;
2707 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002708
2709#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002710 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002711#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002712 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002713#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002714 ret = PyUnicode_FromFormatV(format, vargs);
2715 va_end(vargs);
2716 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002717}
2718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002719#ifdef HAVE_WCHAR_H
2720
Victor Stinner5593d8a2010-10-02 11:11:27 +00002721/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2722 convert a Unicode object to a wide character string.
2723
Victor Stinnerd88d9832011-09-06 02:00:05 +02002724 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002725 character) required to convert the unicode object. Ignore size argument.
2726
Victor Stinnerd88d9832011-09-06 02:00:05 +02002727 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002728 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002729 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002730static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002731unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002732 wchar_t *w,
2733 Py_ssize_t size)
2734{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002735 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002736 const wchar_t *wstr;
2737
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002738 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 if (wstr == NULL)
2740 return -1;
2741
Victor Stinner5593d8a2010-10-02 11:11:27 +00002742 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002743 if (size > res)
2744 size = res + 1;
2745 else
2746 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002747 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002748 return res;
2749 }
2750 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002752}
2753
2754Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002755PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002756 wchar_t *w,
2757 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758{
2759 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002760 PyErr_BadInternalCall();
2761 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002763 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764}
2765
Victor Stinner137c34c2010-09-29 10:25:54 +00002766wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002767PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002768 Py_ssize_t *size)
2769{
2770 wchar_t* buffer;
2771 Py_ssize_t buflen;
2772
2773 if (unicode == NULL) {
2774 PyErr_BadInternalCall();
2775 return NULL;
2776 }
2777
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002778 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002779 if (buflen == -1)
2780 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002781 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002782 PyErr_NoMemory();
2783 return NULL;
2784 }
2785
Victor Stinner137c34c2010-09-29 10:25:54 +00002786 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2787 if (buffer == NULL) {
2788 PyErr_NoMemory();
2789 return NULL;
2790 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002791 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002792 if (buflen == -1)
2793 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002794 if (size != NULL)
2795 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002796 return buffer;
2797}
2798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002799#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800
Alexander Belopolsky40018472011-02-26 01:02:56 +00002801PyObject *
2802PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002804 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002805 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002806 PyErr_SetString(PyExc_ValueError,
2807 "chr() arg not in range(0x110000)");
2808 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002809 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002811 if (ordinal < 256)
2812 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002814 v = PyUnicode_New(1, ordinal);
2815 if (v == NULL)
2816 return NULL;
2817 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002818 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002820}
2821
Alexander Belopolsky40018472011-02-26 01:02:56 +00002822PyObject *
2823PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002825 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002826 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002827 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002828 if (PyUnicode_READY(obj))
2829 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002830 Py_INCREF(obj);
2831 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002832 }
2833 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002834 /* For a Unicode subtype that's not a Unicode object,
2835 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002836 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002837 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002838 PyErr_Format(PyExc_TypeError,
2839 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002840 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002841 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002842}
2843
Alexander Belopolsky40018472011-02-26 01:02:56 +00002844PyObject *
2845PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002846 const char *encoding,
2847 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002848{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002849 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002850 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002851
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002853 PyErr_BadInternalCall();
2854 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002856
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002857 /* Decoding bytes objects is the most common case and should be fast */
2858 if (PyBytes_Check(obj)) {
2859 if (PyBytes_GET_SIZE(obj) == 0) {
2860 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002861 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002862 }
2863 else {
2864 v = PyUnicode_Decode(
2865 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2866 encoding, errors);
2867 }
2868 return v;
2869 }
2870
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002871 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002872 PyErr_SetString(PyExc_TypeError,
2873 "decoding str is not supported");
2874 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002875 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002876
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002877 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2878 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2879 PyErr_Format(PyExc_TypeError,
2880 "coercing to str: need bytes, bytearray "
2881 "or buffer-like object, %.80s found",
2882 Py_TYPE(obj)->tp_name);
2883 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002884 }
Tim Petersced69f82003-09-16 20:30:58 +00002885
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002886 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002887 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002888 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889 }
Tim Petersced69f82003-09-16 20:30:58 +00002890 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002891 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002892
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002893 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002894 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895}
2896
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding normalizes to "utf-8".

   lower receives the NUL-terminated result and must provide room for
   lower_len bytes.  Return 1 on success, 0 on error (the result including
   its terminating NUL does not fit into lower_len bytes, or lower is
   unusable); on error the contents of lower are unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* Without room for at least the terminator, &lower[lower_len - 1]
       would wrap around and the bound check in the loop could never
       trigger. */
    if (lower == NULL || lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof() counts the terminating NUL as well. */
        if (lower_len < sizeof(default_encoding))
            return 0;
        memcpy(lower, default_encoding, sizeof(default_encoding));
        return 1;
    }

    e = encoding;
    l = lower;
    l_end = &lower[lower_len - 1];      /* slot reserved for the NUL */
    while (*e) {
        char c = *e++;

        if (l == l_end)
            return 0;
        /* ASCII-only, locale-independent case folding. */
        if (c >= 'A' && c <= 'Z')
            c = (char)(c - 'A' + 'a');
        else if (c == '_')
            c = '-';
        *l++ = c;
    }
    *l = '\0';
    return 1;
}
2933
Alexander Belopolsky40018472011-02-26 01:02:56 +00002934PyObject *
2935PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002936 Py_ssize_t size,
2937 const char *encoding,
2938 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002939{
2940 PyObject *buffer = NULL, *unicode;
2941 Py_buffer info;
2942 char lower[11]; /* Enough for any encoding shortcut */
2943
Fred Drakee4315f52000-05-09 19:53:39 +00002944 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002945 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002946 if ((strcmp(lower, "utf-8") == 0) ||
2947 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002948 return PyUnicode_DecodeUTF8(s, size, errors);
2949 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002950 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002951 (strcmp(lower, "iso-8859-1") == 0))
2952 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002953#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002954 else if (strcmp(lower, "mbcs") == 0)
2955 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002956#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002957 else if (strcmp(lower, "ascii") == 0)
2958 return PyUnicode_DecodeASCII(s, size, errors);
2959 else if (strcmp(lower, "utf-16") == 0)
2960 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2961 else if (strcmp(lower, "utf-32") == 0)
2962 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2963 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002964
2965 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002966 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002967 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002968 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002969 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002970 if (buffer == NULL)
2971 goto onError;
2972 unicode = PyCodec_Decode(buffer, encoding, errors);
2973 if (unicode == NULL)
2974 goto onError;
2975 if (!PyUnicode_Check(unicode)) {
2976 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002977 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002978 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 Py_DECREF(unicode);
2980 goto onError;
2981 }
2982 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002983 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002984
Benjamin Peterson29060642009-01-31 22:14:21 +00002985 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 Py_XDECREF(buffer);
2987 return NULL;
2988}
2989
Alexander Belopolsky40018472011-02-26 01:02:56 +00002990PyObject *
2991PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002992 const char *encoding,
2993 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002994{
2995 PyObject *v;
2996
2997 if (!PyUnicode_Check(unicode)) {
2998 PyErr_BadArgument();
2999 goto onError;
3000 }
3001
3002 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003003 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003004
3005 /* Decode via the codec registry */
3006 v = PyCodec_Decode(unicode, encoding, errors);
3007 if (v == NULL)
3008 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003009 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003010
Benjamin Peterson29060642009-01-31 22:14:21 +00003011 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003012 return NULL;
3013}
3014
Alexander Belopolsky40018472011-02-26 01:02:56 +00003015PyObject *
3016PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003017 const char *encoding,
3018 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003019{
3020 PyObject *v;
3021
3022 if (!PyUnicode_Check(unicode)) {
3023 PyErr_BadArgument();
3024 goto onError;
3025 }
3026
3027 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003028 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003029
3030 /* Decode via the codec registry */
3031 v = PyCodec_Decode(unicode, encoding, errors);
3032 if (v == NULL)
3033 goto onError;
3034 if (!PyUnicode_Check(v)) {
3035 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003036 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003037 Py_TYPE(v)->tp_name);
3038 Py_DECREF(v);
3039 goto onError;
3040 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003041 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003042
Benjamin Peterson29060642009-01-31 22:14:21 +00003043 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003044 return NULL;
3045}
3046
Alexander Belopolsky40018472011-02-26 01:02:56 +00003047PyObject *
3048PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003049 Py_ssize_t size,
3050 const char *encoding,
3051 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052{
3053 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003054
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055 unicode = PyUnicode_FromUnicode(s, size);
3056 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003057 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3059 Py_DECREF(unicode);
3060 return v;
3061}
3062
Alexander Belopolsky40018472011-02-26 01:02:56 +00003063PyObject *
3064PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003065 const char *encoding,
3066 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003067{
3068 PyObject *v;
3069
3070 if (!PyUnicode_Check(unicode)) {
3071 PyErr_BadArgument();
3072 goto onError;
3073 }
3074
3075 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003076 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003077
3078 /* Encode via the codec registry */
3079 v = PyCodec_Encode(unicode, encoding, errors);
3080 if (v == NULL)
3081 goto onError;
3082 return v;
3083
Benjamin Peterson29060642009-01-31 22:14:21 +00003084 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003085 return NULL;
3086}
3087
Victor Stinnerad158722010-10-27 00:25:46 +00003088PyObject *
3089PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003090{
Victor Stinner99b95382011-07-04 14:23:54 +02003091#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003092 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003093#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003094 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003095#else
Victor Stinner793b5312011-04-27 00:24:21 +02003096 PyInterpreterState *interp = PyThreadState_GET()->interp;
3097 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3098 cannot use it to encode and decode filenames before it is loaded. Load
3099 the Python codec requires to encode at least its own filename. Use the C
3100 version of the locale codec until the codec registry is initialized and
3101 the Python codec is loaded.
3102
3103 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3104 cannot only rely on it: check also interp->fscodec_initialized for
3105 subinterpreters. */
3106 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003107 return PyUnicode_AsEncodedString(unicode,
3108 Py_FileSystemDefaultEncoding,
3109 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003110 }
3111 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003112 /* locale encoding with surrogateescape */
3113 wchar_t *wchar;
3114 char *bytes;
3115 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003116 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003117
3118 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3119 if (wchar == NULL)
3120 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003121 bytes = _Py_wchar2char(wchar, &error_pos);
3122 if (bytes == NULL) {
3123 if (error_pos != (size_t)-1) {
3124 char *errmsg = strerror(errno);
3125 PyObject *exc = NULL;
3126 if (errmsg == NULL)
3127 errmsg = "Py_wchar2char() failed";
3128 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003129 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003130 error_pos, error_pos+1,
3131 errmsg);
3132 Py_XDECREF(exc);
3133 }
3134 else
3135 PyErr_NoMemory();
3136 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003137 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003138 }
3139 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003140
3141 bytes_obj = PyBytes_FromString(bytes);
3142 PyMem_Free(bytes);
3143 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003144 }
Victor Stinnerad158722010-10-27 00:25:46 +00003145#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003146}
3147
Alexander Belopolsky40018472011-02-26 01:02:56 +00003148PyObject *
3149PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003150 const char *encoding,
3151 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152{
3153 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003154 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003155
Guido van Rossumd57fd912000-03-10 22:53:23 +00003156 if (!PyUnicode_Check(unicode)) {
3157 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003158 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159 }
Fred Drakee4315f52000-05-09 19:53:39 +00003160
Fred Drakee4315f52000-05-09 19:53:39 +00003161 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003162 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003163 if ((strcmp(lower, "utf-8") == 0) ||
3164 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003165 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003166 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003167 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003168 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003169 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003170 }
Victor Stinner37296e82010-06-10 13:36:23 +00003171 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003172 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003173 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003174 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003175#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003176 else if (strcmp(lower, "mbcs") == 0)
3177 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003178#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003179 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003180 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003181 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182
3183 /* Encode via the codec registry */
3184 v = PyCodec_Encode(unicode, encoding, errors);
3185 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003186 return NULL;
3187
3188 /* The normal path */
3189 if (PyBytes_Check(v))
3190 return v;
3191
3192 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003193 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003194 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003195 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003196
3197 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3198 "encoder %s returned bytearray instead of bytes",
3199 encoding);
3200 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003201 Py_DECREF(v);
3202 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003203 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003204
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003205 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3206 Py_DECREF(v);
3207 return b;
3208 }
3209
3210 PyErr_Format(PyExc_TypeError,
3211 "encoder did not return a bytes object (type=%.400s)",
3212 Py_TYPE(v)->tp_name);
3213 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003214 return NULL;
3215}
3216
Alexander Belopolsky40018472011-02-26 01:02:56 +00003217PyObject *
3218PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003219 const char *encoding,
3220 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003221{
3222 PyObject *v;
3223
3224 if (!PyUnicode_Check(unicode)) {
3225 PyErr_BadArgument();
3226 goto onError;
3227 }
3228
3229 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003230 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003231
3232 /* Encode via the codec registry */
3233 v = PyCodec_Encode(unicode, encoding, errors);
3234 if (v == NULL)
3235 goto onError;
3236 if (!PyUnicode_Check(v)) {
3237 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003238 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003239 Py_TYPE(v)->tp_name);
3240 Py_DECREF(v);
3241 goto onError;
3242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003244
Benjamin Peterson29060642009-01-31 22:14:21 +00003245 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 return NULL;
3247}
3248
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003249PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003250PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003251 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003252 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3253}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003254
Christian Heimes5894ba72007-11-04 11:43:14 +00003255PyObject*
3256PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3257{
Victor Stinner99b95382011-07-04 14:23:54 +02003258#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003259 return PyUnicode_DecodeMBCS(s, size, NULL);
3260#elif defined(__APPLE__)
3261 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3262#else
Victor Stinner793b5312011-04-27 00:24:21 +02003263 PyInterpreterState *interp = PyThreadState_GET()->interp;
3264 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3265 cannot use it to encode and decode filenames before it is loaded. Load
3266 the Python codec requires to encode at least its own filename. Use the C
3267 version of the locale codec until the codec registry is initialized and
3268 the Python codec is loaded.
3269
3270 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3271 cannot only rely on it: check also interp->fscodec_initialized for
3272 subinterpreters. */
3273 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003274 return PyUnicode_Decode(s, size,
3275 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003276 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003277 }
3278 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003279 /* locale encoding with surrogateescape */
3280 wchar_t *wchar;
3281 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003282 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003283
3284 if (s[size] != '\0' || size != strlen(s)) {
3285 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3286 return NULL;
3287 }
3288
Victor Stinner168e1172010-10-16 23:16:16 +00003289 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003290 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003291 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003292
Victor Stinner168e1172010-10-16 23:16:16 +00003293 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003294 PyMem_Free(wchar);
3295 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003296 }
Victor Stinnerad158722010-10-27 00:25:46 +00003297#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003298}
3299
Martin v. Löwis011e8422009-05-05 04:43:17 +00003300
3301int
3302PyUnicode_FSConverter(PyObject* arg, void* addr)
3303{
3304 PyObject *output = NULL;
3305 Py_ssize_t size;
3306 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003307 if (arg == NULL) {
3308 Py_DECREF(*(PyObject**)addr);
3309 return 1;
3310 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003311 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003312 output = arg;
3313 Py_INCREF(output);
3314 }
3315 else {
3316 arg = PyUnicode_FromObject(arg);
3317 if (!arg)
3318 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003319 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003320 Py_DECREF(arg);
3321 if (!output)
3322 return 0;
3323 if (!PyBytes_Check(output)) {
3324 Py_DECREF(output);
3325 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3326 return 0;
3327 }
3328 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003329 size = PyBytes_GET_SIZE(output);
3330 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003331 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003332 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003333 Py_DECREF(output);
3334 return 0;
3335 }
3336 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003337 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003338}
3339
3340
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003341int
3342PyUnicode_FSDecoder(PyObject* arg, void* addr)
3343{
3344 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003345 if (arg == NULL) {
3346 Py_DECREF(*(PyObject**)addr);
3347 return 1;
3348 }
3349 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003350 if (PyUnicode_READY(arg))
3351 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003352 output = arg;
3353 Py_INCREF(output);
3354 }
3355 else {
3356 arg = PyBytes_FromObject(arg);
3357 if (!arg)
3358 return 0;
3359 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3360 PyBytes_GET_SIZE(arg));
3361 Py_DECREF(arg);
3362 if (!output)
3363 return 0;
3364 if (!PyUnicode_Check(output)) {
3365 Py_DECREF(output);
3366 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3367 return 0;
3368 }
3369 }
Victor Stinner065836e2011-10-27 01:56:33 +02003370 if (PyUnicode_READY(output) < 0) {
3371 Py_DECREF(output);
3372 return 0;
3373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003374 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003375 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003376 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3377 Py_DECREF(output);
3378 return 0;
3379 }
3380 *(PyObject**)addr = output;
3381 return Py_CLEANUP_SUPPORTED;
3382}
3383
3384
Martin v. Löwis5b222132007-06-10 09:51:05 +00003385char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003386PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003387{
Christian Heimesf3863112007-11-22 07:46:41 +00003388 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003389
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003390 if (!PyUnicode_Check(unicode)) {
3391 PyErr_BadArgument();
3392 return NULL;
3393 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003394 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003395 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003396
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003397 if (PyUnicode_UTF8(unicode) == NULL) {
3398 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003399 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3400 if (bytes == NULL)
3401 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003402 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3403 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003404 Py_DECREF(bytes);
3405 return NULL;
3406 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003407 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3408 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3409 PyBytes_AS_STRING(bytes),
3410 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003411 Py_DECREF(bytes);
3412 }
3413
3414 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003415 *psize = PyUnicode_UTF8_LENGTH(unicode);
3416 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003417}
3418
3419char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003420PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003421{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003422 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3423}
3424
3425#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003426static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003427#endif
3428
3429
3430Py_UNICODE *
3431PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3432{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003433 const unsigned char *one_byte;
3434#if SIZEOF_WCHAR_T == 4
3435 const Py_UCS2 *two_bytes;
3436#else
3437 const Py_UCS4 *four_bytes;
3438 const Py_UCS4 *ucs4_end;
3439 Py_ssize_t num_surrogates;
3440#endif
3441 wchar_t *w;
3442 wchar_t *wchar_end;
3443
3444 if (!PyUnicode_Check(unicode)) {
3445 PyErr_BadArgument();
3446 return NULL;
3447 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003448 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003449 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003450 assert(_PyUnicode_KIND(unicode) != 0);
3451 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003452
3453#ifdef Py_DEBUG
3454 ++unicode_as_unicode_calls;
3455#endif
3456
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003457 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003458#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003459 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3460 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003461 num_surrogates = 0;
3462
3463 for (; four_bytes < ucs4_end; ++four_bytes) {
3464 if (*four_bytes > 0xFFFF)
3465 ++num_surrogates;
3466 }
3467
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003468 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3469 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3470 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003471 PyErr_NoMemory();
3472 return NULL;
3473 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003474 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003475
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003476 w = _PyUnicode_WSTR(unicode);
3477 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3478 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003479 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3480 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003481 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003482 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003483 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3484 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003485 }
3486 else
3487 *w = *four_bytes;
3488
3489 if (w > wchar_end) {
3490 assert(0 && "Miscalculated string end");
3491 }
3492 }
3493 *w = 0;
3494#else
3495 /* sizeof(wchar_t) == 4 */
3496 Py_FatalError("Impossible unicode object state, wstr and str "
3497 "should share memory already.");
3498 return NULL;
3499#endif
3500 }
3501 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003502 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3503 (_PyUnicode_LENGTH(unicode) + 1));
3504 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003505 PyErr_NoMemory();
3506 return NULL;
3507 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003508 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3509 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3510 w = _PyUnicode_WSTR(unicode);
3511 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003512
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003513 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3514 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003515 for (; w < wchar_end; ++one_byte, ++w)
3516 *w = *one_byte;
3517 /* null-terminate the wstr */
3518 *w = 0;
3519 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003520 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003521#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003522 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003523 for (; w < wchar_end; ++two_bytes, ++w)
3524 *w = *two_bytes;
3525 /* null-terminate the wstr */
3526 *w = 0;
3527#else
3528 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003529 PyObject_FREE(_PyUnicode_WSTR(unicode));
3530 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003531 Py_FatalError("Impossible unicode object state, wstr "
3532 "and str should share memory already.");
3533 return NULL;
3534#endif
3535 }
3536 else {
3537 assert(0 && "This should never happen.");
3538 }
3539 }
3540 }
3541 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003542 *size = PyUnicode_WSTR_LENGTH(unicode);
3543 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003544}
3545
Alexander Belopolsky40018472011-02-26 01:02:56 +00003546Py_UNICODE *
3547PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003548{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003549 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003550}
3551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003552
Alexander Belopolsky40018472011-02-26 01:02:56 +00003553Py_ssize_t
3554PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555{
3556 if (!PyUnicode_Check(unicode)) {
3557 PyErr_BadArgument();
3558 goto onError;
3559 }
3560 return PyUnicode_GET_SIZE(unicode);
3561
Benjamin Peterson29060642009-01-31 22:14:21 +00003562 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 return -1;
3564}
3565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003566Py_ssize_t
3567PyUnicode_GetLength(PyObject *unicode)
3568{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003569 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003570 PyErr_BadArgument();
3571 return -1;
3572 }
3573
3574 return PyUnicode_GET_LENGTH(unicode);
3575}
3576
3577Py_UCS4
3578PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3579{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003580 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3581 PyErr_BadArgument();
3582 return (Py_UCS4)-1;
3583 }
3584 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3585 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003586 return (Py_UCS4)-1;
3587 }
3588 return PyUnicode_READ_CHAR(unicode, index);
3589}
3590
3591int
3592PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3593{
3594 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003595 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003596 return -1;
3597 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003598 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3599 PyErr_SetString(PyExc_IndexError, "string index out of range");
3600 return -1;
3601 }
3602 if (_PyUnicode_Dirty(unicode))
3603 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003604 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3605 index, ch);
3606 return 0;
3607}
3608
/* Return the name of the default encoding.  The value is the constant,
   NUL-terminated string "utf-8"; the caller must not modify or free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3614
Victor Stinner554f3f02010-06-16 23:33:54 +00003615/* create or adjust a UnicodeDecodeError */
3616static void
3617make_decode_exception(PyObject **exceptionObject,
3618 const char *encoding,
3619 const char *input, Py_ssize_t length,
3620 Py_ssize_t startpos, Py_ssize_t endpos,
3621 const char *reason)
3622{
3623 if (*exceptionObject == NULL) {
3624 *exceptionObject = PyUnicodeDecodeError_Create(
3625 encoding, input, length, startpos, endpos, reason);
3626 }
3627 else {
3628 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3629 goto onError;
3630 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3631 goto onError;
3632 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3633 goto onError;
3634 }
3635 return;
3636
3637onError:
3638 Py_DECREF(*exceptionObject);
3639 *exceptionObject = NULL;
3640}
3641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003642/* error handling callback helper:
3643 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003644 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003645 and adjust various state variables.
3646 return 0 on success, -1 on error
3647*/
3648
Alexander Belopolsky40018472011-02-26 01:02:56 +00003649static int
3650unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003651 const char *encoding, const char *reason,
3652 const char **input, const char **inend, Py_ssize_t *startinpos,
3653 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003654 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003656 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003657
3658 PyObject *restuple = NULL;
3659 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003660 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003661 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003662 Py_ssize_t requiredsize;
3663 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003664 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003665 int res = -1;
3666
Victor Stinner596a6c42011-11-09 00:02:18 +01003667 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3668 outsize = PyUnicode_GET_LENGTH(*output);
3669 else
3670 outsize = _PyUnicode_WSTR_LENGTH(*output);
3671
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003672 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003673 *errorHandler = PyCodec_LookupError(errors);
3674 if (*errorHandler == NULL)
3675 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 }
3677
Victor Stinner554f3f02010-06-16 23:33:54 +00003678 make_decode_exception(exceptionObject,
3679 encoding,
3680 *input, *inend - *input,
3681 *startinpos, *endinpos,
3682 reason);
3683 if (*exceptionObject == NULL)
3684 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685
3686 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3687 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003688 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003689 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003690 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003691 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003692 }
3693 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003694 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003695 if (PyUnicode_READY(repunicode) < 0)
3696 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003697
3698 /* Copy back the bytes variables, which might have been modified by the
3699 callback */
3700 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3701 if (!inputobj)
3702 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003703 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003704 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003705 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003706 *input = PyBytes_AS_STRING(inputobj);
3707 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003708 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003709 /* we can DECREF safely, as the exception has another reference,
3710 so the object won't go away. */
3711 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003712
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003713 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003714 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003715 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003716 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3717 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003718 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719
Victor Stinner596a6c42011-11-09 00:02:18 +01003720 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3721 /* need more space? (at least enough for what we
3722 have+the replacement+the rest of the string (starting
3723 at the new input position), so we won't have to check space
3724 when there are no errors in the rest of the string) */
3725 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3726 requiredsize = *outpos + replen + insize-newpos;
3727 if (requiredsize > outsize) {
3728 if (requiredsize<2*outsize)
3729 requiredsize = 2*outsize;
3730 if (unicode_resize(output, requiredsize) < 0)
3731 goto onError;
3732 }
3733 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003734 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003735 copy_characters(*output, *outpos, repunicode, 0, replen);
3736 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003737 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003738 else {
3739 wchar_t *repwstr;
3740 Py_ssize_t repwlen;
3741 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3742 if (repwstr == NULL)
3743 goto onError;
3744 /* need more space? (at least enough for what we
3745 have+the replacement+the rest of the string (starting
3746 at the new input position), so we won't have to check space
3747 when there are no errors in the rest of the string) */
3748 requiredsize = *outpos + repwlen + insize-newpos;
3749 if (requiredsize > outsize) {
3750 if (requiredsize < 2*outsize)
3751 requiredsize = 2*outsize;
3752 if (unicode_resize(output, requiredsize) < 0)
3753 goto onError;
3754 }
3755 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3756 *outpos += repwlen;
3757 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003759 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003760
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003761 /* we made it! */
3762 res = 0;
3763
Benjamin Peterson29060642009-01-31 22:14:21 +00003764 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003765 Py_XDECREF(restuple);
3766 return res;
3767}
3768
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Helper macros for the base-64 alphabet used inside UTF-7 shift
   sequences.  Note that each macro may evaluate its argument several
   times, so arguments must be free of side effects. */

/* IS_BASE64: true if c is one of A-Z, a-z, 0-9, '+' or '/'. */

#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')

/* FROM_BASE64: the 6-bit value (0..63) of c.  Only meaningful when
   IS_BASE64(c) holds; any other character falls into the final branch
   and yields 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
3803
/* Classification of the 128 ASCII characters for the UTF-7 encoder
 * (see RFC2152).  Each entry is one of:
 *   0 : "Set D"
 *       alphanumeric and '(),-./:?
 *   1 : "Set O"
 *       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"
 *       ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static char utf7_category[128] = {
    /* 0x00: nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10: dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20: sp  !   "   #   $   %   &   '   (   )   *   +   ,   -   .   / */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30: 0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40: @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50: P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60: `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70: p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};

/* ENCODE_DIRECT: true if character c should be written to the UTF-7
 * output as itself.  Set D characters always are; Set O characters only
 * when directO is true and whitespace only when directWS is true.
 * NUL and anything >= 128 are never encoded directly.  RFC2152 makes it
 * clear that the choices for Set O and whitespace vary between
 * applications, so both are left to the caller. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003849
Alexander Belopolsky40018472011-02-26 01:02:56 +00003850PyObject *
3851PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003852 Py_ssize_t size,
3853 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003854{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003855 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3856}
3857
Antoine Pitrou244651a2009-05-04 18:56:13 +00003858/* The decoder. The only state we preserve is our read position,
3859 * i.e. how many characters we have consumed. So if we end in the
3860 * middle of a shift sequence we have to back off the read position
3861 * and the output to the beginning of the sequence, otherwise we lose
3862 * all the shift state (seen bits, number of bits seen, high
3863 * surrogate). */
3864
Alexander Belopolsky40018472011-02-26 01:02:56 +00003865PyObject *
3866PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003867 Py_ssize_t size,
3868 const char *errors,
3869 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003870{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003871 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003872 Py_ssize_t startinpos;
3873 Py_ssize_t endinpos;
3874 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003875 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003876 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003877 const char *errmsg = "";
3878 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003879 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003880 unsigned int base64bits = 0;
3881 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01003882 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003883 PyObject *errorHandler = NULL;
3884 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003885
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003886 /* Start off assuming it's all ASCII. Widen later as necessary. */
3887 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003888 if (!unicode)
3889 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003890 if (size == 0) {
3891 if (consumed)
3892 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003893 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003894 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003895
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003896 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003897 e = s + size;
3898
3899 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003900 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003901 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003902 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003903
Antoine Pitrou244651a2009-05-04 18:56:13 +00003904 if (inShift) { /* in a base-64 section */
3905 if (IS_BASE64(ch)) { /* consume a base-64 character */
3906 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3907 base64bits += 6;
3908 s++;
3909 if (base64bits >= 16) {
3910 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01003911 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00003912 base64bits -= 16;
3913 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3914 if (surrogate) {
3915 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01003916 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
3917 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003918 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3919 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003920 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003921 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003922 }
3923 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003924 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3925 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003926 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003927 }
3928 }
Victor Stinner551ac952011-11-29 22:58:13 +01003929 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003930 /* first surrogate */
3931 surrogate = outCh;
3932 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003933 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003934 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3935 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003936 }
3937 }
3938 }
3939 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003940 inShift = 0;
3941 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003942 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003943 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3944 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003945 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003946 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003947 if (base64bits > 0) { /* left-over bits */
3948 if (base64bits >= 6) {
3949 /* We've seen at least one base-64 character */
3950 errmsg = "partial character in shift sequence";
3951 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003952 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003953 else {
3954 /* Some bits remain; they should be zero */
3955 if (base64buffer != 0) {
3956 errmsg = "non-zero padding bits in shift sequence";
3957 goto utf7Error;
3958 }
3959 }
3960 }
3961 if (ch != '-') {
3962 /* '-' is absorbed; other terminating
3963 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003964 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3965 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003966 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003967 }
3968 }
3969 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003970 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003971 s++; /* consume '+' */
3972 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003973 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003974 if (unicode_putchar(&unicode, &outpos, '+') < 0)
3975 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003976 }
3977 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003978 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003979 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003980 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003981 }
3982 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003983 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003984 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3985 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003986 s++;
3987 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003988 else {
3989 startinpos = s-starts;
3990 s++;
3991 errmsg = "unexpected special character";
3992 goto utf7Error;
3993 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003994 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003995utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003996 endinpos = s-starts;
3997 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003998 errors, &errorHandler,
3999 "utf7", errmsg,
4000 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004001 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004002 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004003 }
4004
Antoine Pitrou244651a2009-05-04 18:56:13 +00004005 /* end of string */
4006
4007 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4008 /* if we're in an inconsistent state, that's an error */
4009 if (surrogate ||
4010 (base64bits >= 6) ||
4011 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004012 endinpos = size;
4013 if (unicode_decode_call_errorhandler(
4014 errors, &errorHandler,
4015 "utf7", "unterminated shift sequence",
4016 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004017 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004018 goto onError;
4019 if (s < e)
4020 goto restart;
4021 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004022 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004023
4024 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004025 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004026 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004027 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004028 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004029 }
4030 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004031 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004032 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004033 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004034
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004035 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004036 goto onError;
4037
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004038 Py_XDECREF(errorHandler);
4039 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004040 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004041
Benjamin Peterson29060642009-01-31 22:14:21 +00004042 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004043 Py_XDECREF(errorHandler);
4044 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004045 Py_DECREF(unicode);
4046 return NULL;
4047}
4048
4049
Alexander Belopolsky40018472011-02-26 01:02:56 +00004050PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004051_PyUnicode_EncodeUTF7(PyObject *str,
4052 int base64SetO,
4053 int base64WhiteSpace,
4054 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004055{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004056 int kind;
4057 void *data;
4058 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004059 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004060 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004061 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004062 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004063 unsigned int base64bits = 0;
4064 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004065 char * out;
4066 char * start;
4067
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004068 if (PyUnicode_READY(str) < 0)
4069 return NULL;
4070 kind = PyUnicode_KIND(str);
4071 data = PyUnicode_DATA(str);
4072 len = PyUnicode_GET_LENGTH(str);
4073
4074 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004075 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004076
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004077 /* It might be possible to tighten this worst case */
4078 allocated = 8 * len;
4079 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004080 return PyErr_NoMemory();
4081
Antoine Pitrou244651a2009-05-04 18:56:13 +00004082 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004083 if (v == NULL)
4084 return NULL;
4085
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004086 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004087 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004088 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004089
Antoine Pitrou244651a2009-05-04 18:56:13 +00004090 if (inShift) {
4091 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4092 /* shifting out */
4093 if (base64bits) { /* output remaining bits */
4094 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4095 base64buffer = 0;
4096 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004097 }
4098 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004099 /* Characters not in the BASE64 set implicitly unshift the sequence
4100 so no '-' is required, except if the character is itself a '-' */
4101 if (IS_BASE64(ch) || ch == '-') {
4102 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004103 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004104 *out++ = (char) ch;
4105 }
4106 else {
4107 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004108 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004109 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004110 else { /* not in a shift sequence */
4111 if (ch == '+') {
4112 *out++ = '+';
4113 *out++ = '-';
4114 }
4115 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4116 *out++ = (char) ch;
4117 }
4118 else {
4119 *out++ = '+';
4120 inShift = 1;
4121 goto encode_char;
4122 }
4123 }
4124 continue;
4125encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004126 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004127 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004128
Antoine Pitrou244651a2009-05-04 18:56:13 +00004129 /* code first surrogate */
4130 base64bits += 16;
4131 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4132 while (base64bits >= 6) {
4133 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4134 base64bits -= 6;
4135 }
4136 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004137 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004138 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004139 base64bits += 16;
4140 base64buffer = (base64buffer << 16) | ch;
4141 while (base64bits >= 6) {
4142 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4143 base64bits -= 6;
4144 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004145 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004146 if (base64bits)
4147 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4148 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004149 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004150 if (_PyBytes_Resize(&v, out - start) < 0)
4151 return NULL;
4152 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004153}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004154PyObject *
4155PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4156 Py_ssize_t size,
4157 int base64SetO,
4158 int base64WhiteSpace,
4159 const char *errors)
4160{
4161 PyObject *result;
4162 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4163 if (tmp == NULL)
4164 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004165 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004166 base64WhiteSpace, errors);
4167 Py_DECREF(tmp);
4168 return result;
4169}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004170
/* The UTF-7 helper macros are private to the codec above; remove them so
   they cannot leak into the rest of the file. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004176
/* --- UTF-8 Codec -------------------------------------------------------- */

/* Map a UTF-8 encoded prefix (first) byte to the length in bytes of the
   sequence it starts.  Zero means illegal prefix: continuation bytes
   (80-BF), the overlong prefixes C0-C1 and everything from F5 upwards.
   See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    /* 80-BF: continuation bytes, never valid as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    /* C0-FF: multi-byte prefixes */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4200
Alexander Belopolsky40018472011-02-26 01:02:56 +00004201PyObject *
4202PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004203 Py_ssize_t size,
4204 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205{
Walter Dörwald69652032004-09-07 20:24:22 +00004206 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4207}
4208
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004209#include "stringlib/ucs1lib.h"
4210#include "stringlib/codecs.h"
4211#include "stringlib/undef.h"
4212
4213#include "stringlib/ucs2lib.h"
4214#include "stringlib/codecs.h"
4215#include "stringlib/undef.h"
4216
4217#include "stringlib/ucs4lib.h"
4218#include "stringlib/codecs.h"
4219#include "stringlib/undef.h"
4220
Antoine Pitrouab868312009-01-10 15:40:25 +00004221/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4222#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4223
4224/* Mask to quickly check whether a C 'long' contains a
4225 non-ASCII, UTF8-encoded char. */
4226#if (SIZEOF_LONG == 8)
4227# define ASCII_CHAR_MASK 0x8080808080808080L
4228#elif (SIZEOF_LONG == 4)
4229# define ASCII_CHAR_MASK 0x80808080L
4230#else
4231# error C 'long' size should be either 4 or 8!
4232#endif
4233
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004234/* Scans a UTF-8 string and returns the maximum character to be expected
4235 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004236
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004237 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004239 */
4240static Py_UCS4
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004241utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
4242 Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004243{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004244 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004245 const unsigned char *p = (const unsigned char *)s;
4246 const unsigned char *end = p + string_size;
4247 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004248
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004249 assert(unicode_size != NULL);
4250
4251 /* By having a cascade of independent loops which fallback onto each
4252 other, we minimize the amount of work done in the average loop
4253 iteration, and we also maximize the CPU's ability to predict
4254 branches correctly (because a given condition will have always the
4255 same boolean outcome except perhaps in the last iteration of the
4256 corresponding loop).
4257 In the general case this brings us rather close to decoding
4258 performance pre-PEP 393, despite the two-pass decoding.
4259
4260 Note that the pure ASCII loop is not duplicated once a non-ASCII
4261 character has been encountered. It is actually a pessimization (by
4262 a significant factor) to use this loop on text with many non-ASCII
4263 characters, and it is important to avoid bad performance on valid
4264 utf-8 data (invalid utf-8 being a different can of worms).
4265 */
4266
4267 /* ASCII */
4268 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004269 /* Only check value if it's not a ASCII char... */
4270 if (*p < 0x80) {
4271 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4272 an explanation. */
4273 if (!((size_t) p & LONG_PTR_MASK)) {
4274 /* Help register allocation */
4275 register const unsigned char *_p = p;
4276 while (_p < aligned_end) {
4277 unsigned long value = *(unsigned long *) _p;
4278 if (value & ASCII_CHAR_MASK)
4279 break;
4280 _p += SIZEOF_LONG;
4281 char_count += SIZEOF_LONG;
4282 }
4283 p = _p;
4284 if (p == end)
4285 break;
4286 }
4287 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004288 if (*p < 0x80)
4289 ++char_count;
4290 else
4291 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004292 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004293 *unicode_size = char_count;
4294 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004295
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004296_ucs1loop:
4297 for (; p < end; ++p) {
4298 if (*p < 0xc4)
4299 char_count += ((*p & 0xc0) != 0x80);
4300 else
4301 goto _ucs2loop;
4302 }
4303 *unicode_size = char_count;
4304 return 255;
4305
4306_ucs2loop:
4307 for (; p < end; ++p) {
4308 if (*p < 0xf0)
4309 char_count += ((*p & 0xc0) != 0x80);
4310 else
4311 goto _ucs4loop;
4312 }
4313 *unicode_size = char_count;
4314 return 65535;
4315
4316_ucs4loop:
4317 for (; p < end; ++p) {
4318 char_count += ((*p & 0xc0) != 0x80);
4319 }
4320 *unicode_size = char_count;
4321 return 65537;
4322}
4323
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors.  Implicit parameters: the `unicode` variable (it may be
   replaced by the resize) and the `onError` label jumped to on failure.

   `index` is evaluated exactly once, so passing `i++` is safe.

   Valid indexes are 0 .. length-1, hence the string has to grow as soon as
   index == length; the new size is index plus 1/8th of slack plus one, so
   that the index written to always fits.
   Potential resizing overallocates, so the result needs to shrink at the end.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        Py_ssize_t pos = (index);                                       \
        if (pos >= PyUnicode_GET_LENGTH(unicode) &&                     \
            unicode_resize(&unicode, pos + pos/8 + 1) < 0)              \
            goto onError;                                               \
        if (unicode_putchar(&unicode, &pos, (value)) < 0)               \
            goto onError;                                               \
    } while (0)
4337
Alexander Belopolsky40018472011-02-26 01:02:56 +00004338PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004339decode_utf8_errors(const char *starts,
4340 Py_ssize_t size,
4341 const char *errors,
4342 Py_ssize_t *consumed,
4343 const char *s,
4344 PyObject *unicode,
4345 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004346{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004347 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004348 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004349 Py_ssize_t startinpos;
4350 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004351 const char *e = starts + size;
4352 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004353 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004354 PyObject *errorHandler = NULL;
4355 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004356
Antoine Pitrouab868312009-01-10 15:40:25 +00004357 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004358
4359 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004360 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361
4362 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004363 /* Fast path for runs of ASCII characters. Given that common UTF-8
4364 input will consist of an overwhelming majority of ASCII
4365 characters, we try to optimize for this case by checking
4366 as many characters as a C 'long' can contain.
4367 First, check if we can do an aligned read, as most CPUs have
4368 a penalty for unaligned reads.
4369 */
4370 if (!((size_t) s & LONG_PTR_MASK)) {
4371 /* Help register allocation */
4372 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004373 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004374 while (_s < aligned_end) {
4375 /* Read a whole long at a time (either 4 or 8 bytes),
4376 and do a fast unrolled copy if it only contains ASCII
4377 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004378 unsigned long value = *(unsigned long *) _s;
4379 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004380 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004381 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4382 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4383 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4384 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004385#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004386 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4387 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4388 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4389 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004390#endif
4391 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004392 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004393 }
4394 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004395 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004396 if (s == e)
4397 break;
4398 ch = (unsigned char)*s;
4399 }
4400 }
4401
4402 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004403 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404 s++;
4405 continue;
4406 }
4407
4408 n = utf8_code_length[ch];
4409
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004410 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004411 if (consumed)
4412 break;
4413 else {
4414 errmsg = "unexpected end of data";
4415 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004416 endinpos = startinpos+1;
4417 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4418 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004419 goto utf8Error;
4420 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004421 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422
4423 switch (n) {
4424
4425 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004426 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004427 startinpos = s-starts;
4428 endinpos = startinpos+1;
4429 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004430
4431 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004432 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004433 startinpos = s-starts;
4434 endinpos = startinpos+1;
4435 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004436
4437 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004438 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004439 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004440 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004441 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004442 goto utf8Error;
4443 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004445 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004446 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447 break;
4448
4449 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004450 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4451 will result in surrogates in range d800-dfff. Surrogates are
4452 not valid UTF-8 so they are rejected.
4453 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4454 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004455 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004456 (s[2] & 0xc0) != 0x80 ||
4457 ((unsigned char)s[0] == 0xE0 &&
4458 (unsigned char)s[1] < 0xA0) ||
4459 ((unsigned char)s[0] == 0xED &&
4460 (unsigned char)s[1] > 0x9F)) {
4461 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004462 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004463 endinpos = startinpos + 1;
4464
4465 /* if s[1] first two bits are 1 and 0, then the invalid
4466 continuation byte is s[2], so increment endinpos by 1,
4467 if not, s[1] is invalid and endinpos doesn't need to
4468 be incremented. */
4469 if ((s[1] & 0xC0) == 0x80)
4470 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004471 goto utf8Error;
4472 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004474 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004475 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004476 break;
4477
4478 case 4:
4479 if ((s[1] & 0xc0) != 0x80 ||
4480 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004481 (s[3] & 0xc0) != 0x80 ||
4482 ((unsigned char)s[0] == 0xF0 &&
4483 (unsigned char)s[1] < 0x90) ||
4484 ((unsigned char)s[0] == 0xF4 &&
4485 (unsigned char)s[1] > 0x8F)) {
4486 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004487 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004488 endinpos = startinpos + 1;
4489 if ((s[1] & 0xC0) == 0x80) {
4490 endinpos++;
4491 if ((s[2] & 0xC0) == 0x80)
4492 endinpos++;
4493 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004494 goto utf8Error;
4495 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004496 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004497 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004498 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004499
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004500 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004501 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502 }
4503 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004504 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004505
Benjamin Peterson29060642009-01-31 22:14:21 +00004506 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004507 if (unicode_decode_call_errorhandler(
4508 errors, &errorHandler,
4509 "utf8", errmsg,
4510 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004511 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004513 /* Update data because unicode_decode_call_errorhandler might have
4514 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004515 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516 }
Walter Dörwald69652032004-09-07 20:24:22 +00004517 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004518 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004520 /* Adjust length and ready string when it contained errors and
4521 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004522 if (unicode_resize(&unicode, i) < 0)
4523 goto onError;
4524 unicode_adjust_maxchar(&unicode);
4525 if (unicode == NULL)
4526 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004527
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528 Py_XDECREF(errorHandler);
4529 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004530 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004531 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004532
Benjamin Peterson29060642009-01-31 22:14:21 +00004533 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004534 Py_XDECREF(errorHandler);
4535 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004536 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004537 return NULL;
4538}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004539#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004540
Victor Stinner785938e2011-12-11 20:09:03 +01004541PyObject *
4542PyUnicode_DecodeUTF8Stateful(const char *s,
4543 Py_ssize_t size,
4544 const char *errors,
4545 Py_ssize_t *consumed)
4546{
4547 Py_UCS4 maxchar = 0;
4548 Py_ssize_t unicode_size;
4549 int has_errors = 0;
4550 PyObject *unicode;
4551 int kind;
4552 void *data;
4553 const char *starts = s;
4554 const char *e;
4555 Py_ssize_t i;
4556
4557 if (size == 0) {
4558 if (consumed)
4559 *consumed = 0;
4560 return (PyObject *)PyUnicode_New(0, 0);
4561 }
4562
4563 maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
4564
4565 /* When the string is ASCII only, just use memcpy and return.
4566 unicode_size may be != size if there is an incomplete UTF-8
4567 sequence at the end of the ASCII block. */
4568 if (maxchar < 128 && size == unicode_size) {
4569 if (consumed)
4570 *consumed = size;
4571 return unicode_fromascii(s, size);
4572 }
4573
4574 unicode = PyUnicode_New(unicode_size, maxchar);
4575 if (!unicode)
4576 return NULL;
4577 kind = PyUnicode_KIND(unicode);
4578 data = PyUnicode_DATA(unicode);
4579
4580 /* Unpack UTF-8 encoded data */
4581 i = 0;
4582 e = starts + size;
4583 switch (kind) {
4584 case PyUnicode_1BYTE_KIND:
4585 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4586 break;
4587 case PyUnicode_2BYTE_KIND:
4588 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4589 break;
4590 case PyUnicode_4BYTE_KIND:
4591 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4592 break;
4593 }
4594 if (!has_errors) {
4595 /* Ensure the unicode size calculation was correct */
4596 assert(i == unicode_size);
4597 assert(s == e);
4598 if (consumed)
4599 *consumed = size;
4600 return unicode;
4601 }
4602
4603 /* In case of errors, maxchar and size computation might be incorrect;
4604 code below refits and resizes as necessary. */
4605 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4606}
4607
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes the size bytes at s into a freshly allocated, NUL-terminated
   wchar_t array.  Every byte that cannot be decoded is stored as
   0xDC00 + byte and decoding resumes at the following byte.

   Returns NULL if the buffer cannot be allocated; otherwise the caller
   receives a PyMem_Malloc() block (presumably to be released with
   PyMem_Free -- confirm against the callers). */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        /* ch holds the lead byte until a sequence has been validated; the
           escape path below relies on that. */
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        /* Compare lengths rather than computing s + n, which could point
           further than one past the end of the buffer. */
        if (n > e - s) {
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */
            *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* Escape the offending byte and resume right after it. */
        *p++ = (wchar_t)(0xDC00 + ch);
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004716/* Primary internal function which creates utf8 encoded bytes objects.
4717
4718 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004719 and allocate exactly as much space needed at the end. Else allocate the
4720 maximum possible needed (4 result bytes per Unicode character), and return
4721 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004722*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004723PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004724_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725{
Tim Peters602f7402002-04-27 18:03:26 +00004726#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004727
Guido van Rossum98297ee2007-11-06 21:34:58 +00004728 Py_ssize_t i; /* index into s of next input byte */
4729 PyObject *result; /* result string object */
4730 char *p; /* next free byte in output buffer */
4731 Py_ssize_t nallocated; /* number of result bytes allocated */
4732 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004733 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004734 PyObject *errorHandler = NULL;
4735 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004736 int kind;
4737 void *data;
4738 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004739 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004741 if (!PyUnicode_Check(unicode)) {
4742 PyErr_BadArgument();
4743 return NULL;
4744 }
4745
4746 if (PyUnicode_READY(unicode) == -1)
4747 return NULL;
4748
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004749 if (PyUnicode_UTF8(unicode))
4750 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4751 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004752
4753 kind = PyUnicode_KIND(unicode);
4754 data = PyUnicode_DATA(unicode);
4755 size = PyUnicode_GET_LENGTH(unicode);
4756
Tim Peters602f7402002-04-27 18:03:26 +00004757 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758
Tim Peters602f7402002-04-27 18:03:26 +00004759 if (size <= MAX_SHORT_UNICHARS) {
4760 /* Write into the stack buffer; nallocated can't overflow.
4761 * At the end, we'll allocate exactly as much heap space as it
4762 * turns out we need.
4763 */
4764 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004765 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004766 p = stackbuf;
4767 }
4768 else {
4769 /* Overallocate on the heap, and give the excess back at the end. */
4770 nallocated = size * 4;
4771 if (nallocated / 4 != size) /* overflow! */
4772 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004773 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004774 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004775 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004776 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004777 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004778
Tim Peters602f7402002-04-27 18:03:26 +00004779 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004780 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004781
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004782 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004783 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004785
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004787 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004788 *p++ = (char)(0xc0 | (ch >> 6));
4789 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner551ac952011-11-29 22:58:13 +01004790 } else if (Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004791 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004792 Py_ssize_t repsize, k, startpos;
4793 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004794 rep = unicode_encode_call_errorhandler(
4795 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004796 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004797 if (!rep)
4798 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004800 if (PyBytes_Check(rep))
4801 repsize = PyBytes_GET_SIZE(rep);
4802 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004803 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004804
4805 if (repsize > 4) {
4806 Py_ssize_t offset;
4807
4808 if (result == NULL)
4809 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004810 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004811 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004813 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4814 /* integer overflow */
4815 PyErr_NoMemory();
4816 goto error;
4817 }
4818 nallocated += repsize - 4;
4819 if (result != NULL) {
4820 if (_PyBytes_Resize(&result, nallocated) < 0)
4821 goto error;
4822 } else {
4823 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004824 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004825 goto error;
4826 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4827 }
4828 p = PyBytes_AS_STRING(result) + offset;
4829 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004831 if (PyBytes_Check(rep)) {
4832 char *prep = PyBytes_AS_STRING(rep);
4833 for(k = repsize; k > 0; k--)
4834 *p++ = *prep++;
4835 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004836 enum PyUnicode_Kind repkind;
4837 void *repdata;
4838
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004839 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004840 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004841 repkind = PyUnicode_KIND(rep);
4842 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004843
4844 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004845 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004846 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004847 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004848 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004849 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004850 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004851 goto error;
4852 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004853 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004854 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004855 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004856 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004857 } else if (ch < 0x10000) {
4858 *p++ = (char)(0xe0 | (ch >> 12));
4859 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4860 *p++ = (char)(0x80 | (ch & 0x3f));
4861 } else /* ch >= 0x10000 */ {
Victor Stinner8faf8212011-12-08 22:14:11 +01004862 assert(ch <= MAX_UNICODE);
Tim Peters602f7402002-04-27 18:03:26 +00004863 /* Encode UCS4 Unicode ordinals */
4864 *p++ = (char)(0xf0 | (ch >> 18));
4865 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4866 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4867 *p++ = (char)(0x80 | (ch & 0x3f));
4868 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004870
Guido van Rossum98297ee2007-11-06 21:34:58 +00004871 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004872 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004873 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004874 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004875 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004876 }
4877 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004878 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004879 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004880 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004881 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004882 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004883
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004884 Py_XDECREF(errorHandler);
4885 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004886 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004887 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004888 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004889 Py_XDECREF(errorHandler);
4890 Py_XDECREF(exc);
4891 Py_XDECREF(result);
4892 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004893
Tim Peters602f7402002-04-27 18:03:26 +00004894#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895}
4896
Alexander Belopolsky40018472011-02-26 01:02:56 +00004897PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004898PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4899 Py_ssize_t size,
4900 const char *errors)
4901{
4902 PyObject *v, *unicode;
4903
4904 unicode = PyUnicode_FromUnicode(s, size);
4905 if (unicode == NULL)
4906 return NULL;
4907 v = _PyUnicode_AsUTF8String(unicode, errors);
4908 Py_DECREF(unicode);
4909 return v;
4910}
4911
4912PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004913PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004915 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916}
4917
Walter Dörwald41980ca2007-08-16 21:55:45 +00004918/* --- UTF-32 Codec ------------------------------------------------------- */
4919
4920PyObject *
4921PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004922 Py_ssize_t size,
4923 const char *errors,
4924 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004925{
4926 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4927}
4928
4929PyObject *
4930PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004931 Py_ssize_t size,
4932 const char *errors,
4933 int *byteorder,
4934 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004935{
4936 const char *starts = s;
4937 Py_ssize_t startinpos;
4938 Py_ssize_t endinpos;
4939 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004940 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004941 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004942 int bo = 0; /* assume native ordering by default */
4943 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004944 /* Offsets from q for retrieving bytes in the right order. */
4945#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4946 int iorder[] = {0, 1, 2, 3};
4947#else
4948 int iorder[] = {3, 2, 1, 0};
4949#endif
4950 PyObject *errorHandler = NULL;
4951 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004952
Walter Dörwald41980ca2007-08-16 21:55:45 +00004953 q = (unsigned char *)s;
4954 e = q + size;
4955
4956 if (byteorder)
4957 bo = *byteorder;
4958
4959 /* Check for BOM marks (U+FEFF) in the input and adjust current
4960 byte order setting accordingly. In native mode, the leading BOM
4961 mark is skipped, in all other modes, it is copied to the output
4962 stream as-is (giving a ZWNBSP character). */
4963 if (bo == 0) {
4964 if (size >= 4) {
4965 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004966 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004967#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004968 if (bom == 0x0000FEFF) {
4969 q += 4;
4970 bo = -1;
4971 }
4972 else if (bom == 0xFFFE0000) {
4973 q += 4;
4974 bo = 1;
4975 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004976#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004977 if (bom == 0x0000FEFF) {
4978 q += 4;
4979 bo = 1;
4980 }
4981 else if (bom == 0xFFFE0000) {
4982 q += 4;
4983 bo = -1;
4984 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004985#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004986 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004987 }
4988
4989 if (bo == -1) {
4990 /* force LE */
4991 iorder[0] = 0;
4992 iorder[1] = 1;
4993 iorder[2] = 2;
4994 iorder[3] = 3;
4995 }
4996 else if (bo == 1) {
4997 /* force BE */
4998 iorder[0] = 3;
4999 iorder[1] = 2;
5000 iorder[2] = 1;
5001 iorder[3] = 0;
5002 }
5003
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005004 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005005 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005006 if (!unicode)
5007 return NULL;
5008 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005009 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005010 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005011
Walter Dörwald41980ca2007-08-16 21:55:45 +00005012 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005013 Py_UCS4 ch;
5014 /* remaining bytes at the end? (size should be divisible by 4) */
5015 if (e-q<4) {
5016 if (consumed)
5017 break;
5018 errmsg = "truncated data";
5019 startinpos = ((const char *)q)-starts;
5020 endinpos = ((const char *)e)-starts;
5021 goto utf32Error;
5022 /* The remaining input chars are ignored if the callback
5023 chooses to skip the input */
5024 }
5025 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5026 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005027
Benjamin Peterson29060642009-01-31 22:14:21 +00005028 if (ch >= 0x110000)
5029 {
5030 errmsg = "codepoint not in range(0x110000)";
5031 startinpos = ((const char *)q)-starts;
5032 endinpos = startinpos+4;
5033 goto utf32Error;
5034 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005035 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5036 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005037 q += 4;
5038 continue;
5039 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005040 if (unicode_decode_call_errorhandler(
5041 errors, &errorHandler,
5042 "utf32", errmsg,
5043 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005044 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005045 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005046 }
5047
5048 if (byteorder)
5049 *byteorder = bo;
5050
5051 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005053
5054 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005055 if (PyUnicode_Resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005056 goto onError;
5057
5058 Py_XDECREF(errorHandler);
5059 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005060 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005061
Benjamin Peterson29060642009-01-31 22:14:21 +00005062 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005063 Py_DECREF(unicode);
5064 Py_XDECREF(errorHandler);
5065 Py_XDECREF(exc);
5066 return NULL;
5067}
5068
5069PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005070_PyUnicode_EncodeUTF32(PyObject *str,
5071 const char *errors,
5072 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005073{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005074 int kind;
5075 void *data;
5076 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005077 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005078 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005079 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005080 /* Offsets from p for storing byte pairs in the right order. */
5081#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5082 int iorder[] = {0, 1, 2, 3};
5083#else
5084 int iorder[] = {3, 2, 1, 0};
5085#endif
5086
Benjamin Peterson29060642009-01-31 22:14:21 +00005087#define STORECHAR(CH) \
5088 do { \
5089 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5090 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5091 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5092 p[iorder[0]] = (CH) & 0xff; \
5093 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005094 } while(0)
5095
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005096 if (!PyUnicode_Check(str)) {
5097 PyErr_BadArgument();
5098 return NULL;
5099 }
5100 if (PyUnicode_READY(str) < 0)
5101 return NULL;
5102 kind = PyUnicode_KIND(str);
5103 data = PyUnicode_DATA(str);
5104 len = PyUnicode_GET_LENGTH(str);
5105
5106 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005107 bytesize = nsize * 4;
5108 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005109 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005110 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005111 if (v == NULL)
5112 return NULL;
5113
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005114 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005115 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005116 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005117 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005118 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005119
5120 if (byteorder == -1) {
5121 /* force LE */
5122 iorder[0] = 0;
5123 iorder[1] = 1;
5124 iorder[2] = 2;
5125 iorder[3] = 3;
5126 }
5127 else if (byteorder == 1) {
5128 /* force BE */
5129 iorder[0] = 3;
5130 iorder[1] = 2;
5131 iorder[2] = 1;
5132 iorder[3] = 0;
5133 }
5134
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005135 for (i = 0; i < len; i++)
5136 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005137
5138 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005139 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005140#undef STORECHAR
5141}
5142
Alexander Belopolsky40018472011-02-26 01:02:56 +00005143PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005144PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5145 Py_ssize_t size,
5146 const char *errors,
5147 int byteorder)
5148{
5149 PyObject *result;
5150 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5151 if (tmp == NULL)
5152 return NULL;
5153 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5154 Py_DECREF(tmp);
5155 return result;
5156}
5157
5158PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005159PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005160{
Victor Stinnerb960b342011-11-20 19:12:52 +01005161 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005162}
5163
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164/* --- UTF-16 Codec ------------------------------------------------------- */
5165
Tim Peters772747b2001-08-09 22:21:55 +00005166PyObject *
5167PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005168 Py_ssize_t size,
5169 const char *errors,
5170 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171{
Walter Dörwald69652032004-09-07 20:24:22 +00005172 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5173}
5174
Antoine Pitrouab868312009-01-10 15:40:25 +00005175/* Two masks for fast checking of whether a C 'long' may contain
5176 UTF16-encoded surrogate characters. This is an efficient heuristic,
5177 assuming that non-surrogate characters with a code point >= 0x8000 are
5178 rare in most input.
5179 FAST_CHAR_MASK is used when the input is in native byte ordering,
5180 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005181*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005182#if (SIZEOF_LONG == 8)
5183# define FAST_CHAR_MASK 0x8000800080008000L
5184# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5185#elif (SIZEOF_LONG == 4)
5186# define FAST_CHAR_MASK 0x80008000L
5187# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5188#else
5189# error C 'long' size should be either 4 or 8!
5190#endif
5191
Walter Dörwald69652032004-09-07 20:24:22 +00005192PyObject *
5193PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 Py_ssize_t size,
5195 const char *errors,
5196 int *byteorder,
5197 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005198{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005199 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005200 Py_ssize_t startinpos;
5201 Py_ssize_t endinpos;
5202 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005203 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005204 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005205 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005206 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005207 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005208 /* Offsets from q for retrieving byte pairs in the right order. */
5209#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5210 int ihi = 1, ilo = 0;
5211#else
5212 int ihi = 0, ilo = 1;
5213#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005214 PyObject *errorHandler = NULL;
5215 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216
5217 /* Note: size will always be longer than the resulting Unicode
5218 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005219 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220 if (!unicode)
5221 return NULL;
5222 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005223 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005224 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225
Tim Peters772747b2001-08-09 22:21:55 +00005226 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005227 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228
5229 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005230 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005232 /* Check for BOM marks (U+FEFF) in the input and adjust current
5233 byte order setting accordingly. In native mode, the leading BOM
5234 mark is skipped, in all other modes, it is copied to the output
5235 stream as-is (giving a ZWNBSP character). */
5236 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005237 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005238 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005239#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005240 if (bom == 0xFEFF) {
5241 q += 2;
5242 bo = -1;
5243 }
5244 else if (bom == 0xFFFE) {
5245 q += 2;
5246 bo = 1;
5247 }
Tim Petersced69f82003-09-16 20:30:58 +00005248#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005249 if (bom == 0xFEFF) {
5250 q += 2;
5251 bo = 1;
5252 }
5253 else if (bom == 0xFFFE) {
5254 q += 2;
5255 bo = -1;
5256 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005257#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005258 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005259 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260
Tim Peters772747b2001-08-09 22:21:55 +00005261 if (bo == -1) {
5262 /* force LE */
5263 ihi = 1;
5264 ilo = 0;
5265 }
5266 else if (bo == 1) {
5267 /* force BE */
5268 ihi = 0;
5269 ilo = 1;
5270 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005271#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5272 native_ordering = ilo < ihi;
5273#else
5274 native_ordering = ilo > ihi;
5275#endif
Tim Peters772747b2001-08-09 22:21:55 +00005276
Antoine Pitrouab868312009-01-10 15:40:25 +00005277 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005278 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005279 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005280 /* First check for possible aligned read of a C 'long'. Unaligned
5281 reads are more expensive, better to defer to another iteration. */
5282 if (!((size_t) q & LONG_PTR_MASK)) {
5283 /* Fast path for runs of non-surrogate chars. */
5284 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005285 int kind = PyUnicode_KIND(unicode);
5286 void *data = PyUnicode_DATA(unicode);
5287 while (_q < aligned_end) {
5288 unsigned long block = * (unsigned long *) _q;
5289 unsigned short *pblock = (unsigned short*)&block;
5290 Py_UCS4 maxch;
5291 if (native_ordering) {
5292 /* Can use buffer directly */
5293 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005294 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005295 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005296 else {
5297 /* Need to byte-swap */
5298 unsigned char *_p = (unsigned char*)pblock;
5299 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005300 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005301 _p[0] = _q[1];
5302 _p[1] = _q[0];
5303 _p[2] = _q[3];
5304 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005305#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005306 _p[4] = _q[5];
5307 _p[5] = _q[4];
5308 _p[6] = _q[7];
5309 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005310#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005311 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005312 maxch = Py_MAX(pblock[0], pblock[1]);
5313#if SIZEOF_LONG == 8
5314 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5315#endif
5316 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5317 if (unicode_widen(&unicode, maxch) < 0)
5318 goto onError;
5319 kind = PyUnicode_KIND(unicode);
5320 data = PyUnicode_DATA(unicode);
5321 }
5322 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5323 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5324#if SIZEOF_LONG == 8
5325 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5326 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5327#endif
5328 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005329 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005330 q = _q;
5331 if (q >= e)
5332 break;
5333 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005335
Benjamin Peterson14339b62009-01-31 16:36:08 +00005336 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005337
Victor Stinner551ac952011-11-29 22:58:13 +01005338 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005339 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5340 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005341 continue;
5342 }
5343
5344 /* UTF-16 code pair: */
5345 if (q > e) {
5346 errmsg = "unexpected end of data";
5347 startinpos = (((const char *)q) - 2) - starts;
5348 endinpos = ((const char *)e) + 1 - starts;
5349 goto utf16Error;
5350 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005351 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5352 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005354 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005355 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005356 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005357 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005358 continue;
5359 }
5360 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005361 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005362 startinpos = (((const char *)q)-4)-starts;
5363 endinpos = startinpos+2;
5364 goto utf16Error;
5365 }
5366
Benjamin Peterson14339b62009-01-31 16:36:08 +00005367 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005368 errmsg = "illegal encoding";
5369 startinpos = (((const char *)q)-2)-starts;
5370 endinpos = startinpos+2;
5371 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005372
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005374 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005375 errors,
5376 &errorHandler,
5377 "utf16", errmsg,
5378 &starts,
5379 (const char **)&e,
5380 &startinpos,
5381 &endinpos,
5382 &exc,
5383 (const char **)&q,
5384 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005385 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005386 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005388 /* remaining byte at the end? (size should be even) */
5389 if (e == q) {
5390 if (!consumed) {
5391 errmsg = "truncated data";
5392 startinpos = ((const char *)q) - starts;
5393 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005394 if (unicode_decode_call_errorhandler(
5395 errors,
5396 &errorHandler,
5397 "utf16", errmsg,
5398 &starts,
5399 (const char **)&e,
5400 &startinpos,
5401 &endinpos,
5402 &exc,
5403 (const char **)&q,
5404 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005405 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005406 goto onError;
5407 /* The remaining input chars are ignored if the callback
5408 chooses to skip the input */
5409 }
5410 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411
5412 if (byteorder)
5413 *byteorder = bo;
5414
Walter Dörwald69652032004-09-07 20:24:22 +00005415 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005416 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005417
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005419 if (PyUnicode_Resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 goto onError;
5421
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005422 Py_XDECREF(errorHandler);
5423 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005424 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425
Benjamin Peterson29060642009-01-31 22:14:21 +00005426 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005428 Py_XDECREF(errorHandler);
5429 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 return NULL;
5431}
5432
Antoine Pitrouab868312009-01-10 15:40:25 +00005433#undef FAST_CHAR_MASK
5434#undef SWAPPED_FAST_CHAR_MASK
5435
Tim Peters772747b2001-08-09 22:21:55 +00005436PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005437_PyUnicode_EncodeUTF16(PyObject *str,
5438 const char *errors,
5439 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005441 int kind;
5442 void *data;
5443 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005444 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005445 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005446 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005447 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005448 /* Offsets from p for storing byte pairs in the right order. */
5449#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5450 int ihi = 1, ilo = 0;
5451#else
5452 int ihi = 0, ilo = 1;
5453#endif
5454
Benjamin Peterson29060642009-01-31 22:14:21 +00005455#define STORECHAR(CH) \
5456 do { \
5457 p[ihi] = ((CH) >> 8) & 0xff; \
5458 p[ilo] = (CH) & 0xff; \
5459 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005460 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005462 if (!PyUnicode_Check(str)) {
5463 PyErr_BadArgument();
5464 return NULL;
5465 }
5466 if (PyUnicode_READY(str) < 0)
5467 return NULL;
5468 kind = PyUnicode_KIND(str);
5469 data = PyUnicode_DATA(str);
5470 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005471
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005472 pairs = 0;
5473 if (kind == PyUnicode_4BYTE_KIND)
5474 for (i = 0; i < len; i++)
5475 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5476 pairs++;
5477 /* 2 * (len + pairs + (byteorder == 0)) */
5478 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005479 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005480 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005481 bytesize = nsize * 2;
5482 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005483 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005484 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485 if (v == NULL)
5486 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005488 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005490 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005491 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005492 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005493
5494 if (byteorder == -1) {
5495 /* force LE */
5496 ihi = 1;
5497 ilo = 0;
5498 }
5499 else if (byteorder == 1) {
5500 /* force BE */
5501 ihi = 0;
5502 ilo = 1;
5503 }
5504
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005505 for (i = 0; i < len; i++) {
5506 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5507 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005508 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005509 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5510 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 }
Tim Peters772747b2001-08-09 22:21:55 +00005512 STORECHAR(ch);
5513 if (ch2)
5514 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005515 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005516
5517 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005518 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005519#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520}
5521
Alexander Belopolsky40018472011-02-26 01:02:56 +00005522PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005523PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5524 Py_ssize_t size,
5525 const char *errors,
5526 int byteorder)
5527{
5528 PyObject *result;
5529 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5530 if (tmp == NULL)
5531 return NULL;
5532 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5533 Py_DECREF(tmp);
5534 return result;
5535}
5536
5537PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005538PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005540 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541}
5542
5543/* --- Unicode Escape Codec ----------------------------------------------- */
5544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005545/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5546 if all the escapes in the string make it still a valid ASCII string.
5547 Returns -1 if any escapes were found which cause the string to
5548 pop out of ASCII range. Otherwise returns the length of the
5549 required buffer to hold the string.
5550 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005551static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005552length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5553{
5554 const unsigned char *p = (const unsigned char *)s;
5555 const unsigned char *end = p + size;
5556 Py_ssize_t length = 0;
5557
5558 if (size < 0)
5559 return -1;
5560
5561 for (; p < end; ++p) {
5562 if (*p > 127) {
5563 /* Non-ASCII */
5564 return -1;
5565 }
5566 else if (*p != '\\') {
5567 /* Normal character */
5568 ++length;
5569 }
5570 else {
5571 /* Backslash-escape, check next char */
5572 ++p;
5573 /* Escape sequence reaches till end of string or
5574 non-ASCII follow-up. */
5575 if (p >= end || *p > 127)
5576 return -1;
5577 switch (*p) {
5578 case '\n':
5579 /* backslash + \n result in zero characters */
5580 break;
5581 case '\\': case '\'': case '\"':
5582 case 'b': case 'f': case 't':
5583 case 'n': case 'r': case 'v': case 'a':
5584 ++length;
5585 break;
5586 case '0': case '1': case '2': case '3':
5587 case '4': case '5': case '6': case '7':
5588 case 'x': case 'u': case 'U': case 'N':
5589 /* these do not guarantee ASCII characters */
5590 return -1;
5591 default:
5592 /* count the backslash + the other character */
5593 length += 2;
5594 }
5595 }
5596 }
5597 return length;
5598}
5599
Fredrik Lundh06d12682001-01-24 07:59:11 +00005600static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005601
Alexander Belopolsky40018472011-02-26 01:02:56 +00005602PyObject *
5603PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005604 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005605 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005607 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005608 Py_ssize_t startinpos;
5609 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005610 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005611 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005613 char* message;
5614 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005615 PyObject *errorHandler = NULL;
5616 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005617 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005618 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005619
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005620 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005621
5622 /* After length_of_escaped_ascii_string() there are two alternatives,
5623 either the string is pure ASCII with named escapes like \n, etc.
5624 and we determined it's exact size (common case)
5625 or it contains \x, \u, ... escape sequences. then we create a
5626 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005627 if (len >= 0) {
5628 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005629 if (!v)
5630 goto onError;
5631 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005632 }
5633 else {
5634 /* Escaped strings will always be longer than the resulting
5635 Unicode string, so we start with size here and then reduce the
5636 length after conversion to the true value.
5637 (but if the error callback returns a long replacement string
5638 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005639 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005640 if (!v)
5641 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005642 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005643 }
5644
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005646 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005647 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005649
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650 while (s < end) {
5651 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005652 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005653 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005655 /* The only case in which i == ascii_length is a backslash
5656 followed by a newline. */
5657 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005658
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 /* Non-escape characters are interpreted as Unicode ordinals */
5660 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005661 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5662 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 continue;
5664 }
5665
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005666 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667 /* \ - Escapes */
5668 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005669 c = *s++;
5670 if (s > end)
5671 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005672
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005673 /* The only case in which i == ascii_length is a backslash
5674 followed by a newline. */
5675 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005676
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005677 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678
Benjamin Peterson29060642009-01-31 22:14:21 +00005679 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005680#define WRITECHAR(ch) \
5681 do { \
5682 if (unicode_putchar(&v, &i, ch) < 0) \
5683 goto onError; \
5684 }while(0)
5685
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005687 case '\\': WRITECHAR('\\'); break;
5688 case '\'': WRITECHAR('\''); break;
5689 case '\"': WRITECHAR('\"'); break;
5690 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005691 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005692 case 'f': WRITECHAR('\014'); break;
5693 case 't': WRITECHAR('\t'); break;
5694 case 'n': WRITECHAR('\n'); break;
5695 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005696 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005697 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005698 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005699 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 case '0': case '1': case '2': case '3':
5703 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005704 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005705 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005706 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005707 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005708 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005710 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 break;
5712
Benjamin Peterson29060642009-01-31 22:14:21 +00005713 /* hex escapes */
5714 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005716 digits = 2;
5717 message = "truncated \\xXX escape";
5718 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719
Benjamin Peterson29060642009-01-31 22:14:21 +00005720 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005722 digits = 4;
5723 message = "truncated \\uXXXX escape";
5724 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005727 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005728 digits = 8;
5729 message = "truncated \\UXXXXXXXX escape";
5730 hexescape:
5731 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005732 if (s+digits>end) {
5733 endinpos = size;
5734 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005735 errors, &errorHandler,
5736 "unicodeescape", "end of string in escape sequence",
5737 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005738 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005739 goto onError;
5740 goto nextByte;
5741 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005742 for (j = 0; j < digits; ++j) {
5743 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005744 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005745 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005746 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 errors, &errorHandler,
5748 "unicodeescape", message,
5749 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005750 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005751 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005752 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005753 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005754 }
5755 chr = (chr<<4) & ~0xF;
5756 if (c >= '0' && c <= '9')
5757 chr += c - '0';
5758 else if (c >= 'a' && c <= 'f')
5759 chr += 10 + c - 'a';
5760 else
5761 chr += 10 + c - 'A';
5762 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005763 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005764 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005765 /* _decoding_error will have already written into the
5766 target buffer. */
5767 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005768 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005769 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005770 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005771 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005772 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005773 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005774 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005775 errors, &errorHandler,
5776 "unicodeescape", "illegal Unicode character",
5777 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005778 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005779 goto onError;
5780 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005781 break;
5782
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005784 case 'N':
5785 message = "malformed \\N character escape";
5786 if (ucnhash_CAPI == NULL) {
5787 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005788 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5789 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005790 if (ucnhash_CAPI == NULL)
5791 goto ucnhashError;
5792 }
5793 if (*s == '{') {
5794 const char *start = s+1;
5795 /* look for the closing brace */
5796 while (*s != '}' && s < end)
5797 s++;
5798 if (s > start && s < end && *s == '}') {
5799 /* found a name. look it up in the unicode database */
5800 message = "unknown Unicode character name";
5801 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005802 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005803 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005804 goto store;
5805 }
5806 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005807 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005808 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 errors, &errorHandler,
5810 "unicodeescape", message,
5811 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005812 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005813 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005814 break;
5815
5816 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005817 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005818 message = "\\ at end of string";
5819 s--;
5820 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005821 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 errors, &errorHandler,
5823 "unicodeescape", message,
5824 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005825 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005826 goto onError;
5827 }
5828 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005829 WRITECHAR('\\');
5830 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005831 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005832 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005835 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005837#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005838
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005839 if (PyUnicode_Resize(&v, i) < 0)
5840 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005841 Py_XDECREF(errorHandler);
5842 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005843 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005844
Benjamin Peterson29060642009-01-31 22:14:21 +00005845 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005846 PyErr_SetString(
5847 PyExc_UnicodeError,
5848 "\\N escapes not supported (can't load unicodedata module)"
5849 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005850 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005851 Py_XDECREF(errorHandler);
5852 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005853 return NULL;
5854
Benjamin Peterson29060642009-01-31 22:14:21 +00005855 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005857 Py_XDECREF(errorHandler);
5858 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 return NULL;
5860}
5861
5862/* Return a Unicode-Escape string version of the Unicode object.
5863
5864 If quotes is true, the string is enclosed in u"" or u'' quotes as
5865 appropriate.
5866
5867*/
5868
Alexander Belopolsky40018472011-02-26 01:02:56 +00005869PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005870PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005872 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005873 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005875 int kind;
5876 void *data;
5877 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878
Thomas Wouters89f507f2006-12-13 04:49:30 +00005879 /* Initial allocation is based on the longest-possible unichr
5880 escape.
5881
5882 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5883 unichr, so in this case it's the longest unichr escape. In
5884 narrow (UTF-16) builds this is five chars per source unichr
5885 since there are two unichrs in the surrogate pair, so in narrow
5886 (UTF-16) builds it's not the longest unichr escape.
5887
5888 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5889 so in the narrow (UTF-16) build case it's the longest unichr
5890 escape.
5891 */
5892
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005893 if (!PyUnicode_Check(unicode)) {
5894 PyErr_BadArgument();
5895 return NULL;
5896 }
5897 if (PyUnicode_READY(unicode) < 0)
5898 return NULL;
5899 len = PyUnicode_GET_LENGTH(unicode);
5900 kind = PyUnicode_KIND(unicode);
5901 data = PyUnicode_DATA(unicode);
5902 switch(kind) {
5903 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5904 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5905 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5906 }
5907
5908 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005909 return PyBytes_FromStringAndSize(NULL, 0);
5910
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005911 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005912 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005913
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005914 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005916 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 if (repr == NULL)
5919 return NULL;
5920
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005921 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005923 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005924 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005925
Walter Dörwald79e913e2007-05-12 11:08:06 +00005926 /* Escape backslashes */
5927 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 *p++ = '\\';
5929 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005930 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005931 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005932
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005933 /* Map 21-bit characters to '\U00xxxxxx' */
5934 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005935 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005936 *p++ = '\\';
5937 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005938 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5939 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5940 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5941 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5942 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5943 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5944 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5945 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005946 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005947 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005948
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005950 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951 *p++ = '\\';
5952 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005953 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5954 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5955 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5956 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005958
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005959 /* Map special whitespace to '\t', \n', '\r' */
5960 else if (ch == '\t') {
5961 *p++ = '\\';
5962 *p++ = 't';
5963 }
5964 else if (ch == '\n') {
5965 *p++ = '\\';
5966 *p++ = 'n';
5967 }
5968 else if (ch == '\r') {
5969 *p++ = '\\';
5970 *p++ = 'r';
5971 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005972
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005973 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005974 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005976 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005977 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5978 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005979 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005980
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 /* Copy everything else as-is */
5982 else
5983 *p++ = (char) ch;
5984 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005986 assert(p - PyBytes_AS_STRING(repr) > 0);
5987 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5988 return NULL;
5989 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990}
5991
Alexander Belopolsky40018472011-02-26 01:02:56 +00005992PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005993PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5994 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005996 PyObject *result;
5997 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5998 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006000 result = PyUnicode_AsUnicodeEscapeString(tmp);
6001 Py_DECREF(tmp);
6002 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003}
6004
6005/* --- Raw Unicode Escape Codec ------------------------------------------- */
6006
Alexander Belopolsky40018472011-02-26 01:02:56 +00006007PyObject *
6008PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006009 Py_ssize_t size,
6010 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006012 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006013 Py_ssize_t startinpos;
6014 Py_ssize_t endinpos;
6015 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006016 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 const char *end;
6018 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006019 PyObject *errorHandler = NULL;
6020 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006021
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022 /* Escaped strings will always be longer than the resulting
6023 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006024 length after conversion to the true value. (But decoding error
6025 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006026 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006028 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006030 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006031 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 end = s + size;
6033 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006034 unsigned char c;
6035 Py_UCS4 x;
6036 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006037 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038
Benjamin Peterson29060642009-01-31 22:14:21 +00006039 /* Non-escape characters are interpreted as Unicode ordinals */
6040 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006041 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6042 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006043 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006044 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006045 startinpos = s-starts;
6046
6047 /* \u-escapes are only interpreted iff the number of leading
6048 backslashes if odd */
6049 bs = s;
6050 for (;s < end;) {
6051 if (*s != '\\')
6052 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006053 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6054 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006055 }
6056 if (((s - bs) & 1) == 0 ||
6057 s >= end ||
6058 (*s != 'u' && *s != 'U')) {
6059 continue;
6060 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006061 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006062 count = *s=='u' ? 4 : 8;
6063 s++;
6064
6065 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 for (x = 0, i = 0; i < count; ++i, ++s) {
6067 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006068 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 endinpos = s-starts;
6070 if (unicode_decode_call_errorhandler(
6071 errors, &errorHandler,
6072 "rawunicodeescape", "truncated \\uXXXX",
6073 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006074 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 goto onError;
6076 goto nextByte;
6077 }
6078 x = (x<<4) & ~0xF;
6079 if (c >= '0' && c <= '9')
6080 x += c - '0';
6081 else if (c >= 'a' && c <= 'f')
6082 x += 10 + c - 'a';
6083 else
6084 x += 10 + c - 'A';
6085 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006086 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006087 if (unicode_putchar(&v, &outpos, x) < 0)
6088 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006089 } else {
6090 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006091 if (unicode_decode_call_errorhandler(
6092 errors, &errorHandler,
6093 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006095 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006096 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006097 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006098 nextByte:
6099 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006101 if (PyUnicode_Resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006102 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006103 Py_XDECREF(errorHandler);
6104 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006105 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006106
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006109 Py_XDECREF(errorHandler);
6110 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 return NULL;
6112}
6113
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006114
Alexander Belopolsky40018472011-02-26 01:02:56 +00006115PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006116PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006118 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119 char *p;
6120 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006121 Py_ssize_t expandsize, pos;
6122 int kind;
6123 void *data;
6124 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006126 if (!PyUnicode_Check(unicode)) {
6127 PyErr_BadArgument();
6128 return NULL;
6129 }
6130 if (PyUnicode_READY(unicode) < 0)
6131 return NULL;
6132 kind = PyUnicode_KIND(unicode);
6133 data = PyUnicode_DATA(unicode);
6134 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006135 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6136 bytes, and 1 byte characters 4. */
6137 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006138
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006139 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006141
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006142 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 if (repr == NULL)
6144 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006145 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006146 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006148 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006149 for (pos = 0; pos < len; pos++) {
6150 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006151 /* Map 32-bit characters to '\Uxxxxxxxx' */
6152 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006153 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006154 *p++ = '\\';
6155 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006156 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6157 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6158 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6159 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6160 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6161 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6162 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6163 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006164 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006165 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006166 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167 *p++ = '\\';
6168 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006169 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6170 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6171 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6172 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006174 /* Copy everything else as-is */
6175 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 *p++ = (char) ch;
6177 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006178
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006179 assert(p > q);
6180 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006181 return NULL;
6182 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183}
6184
Alexander Belopolsky40018472011-02-26 01:02:56 +00006185PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006186PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6187 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006189 PyObject *result;
6190 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6191 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006192 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006193 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6194 Py_DECREF(tmp);
6195 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196}
6197
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006198/* --- Unicode Internal Codec ------------------------------------------- */
6199
Alexander Belopolsky40018472011-02-26 01:02:56 +00006200PyObject *
6201_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006202 Py_ssize_t size,
6203 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006204{
6205 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006206 Py_ssize_t startinpos;
6207 Py_ssize_t endinpos;
6208 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006209 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006210 const char *end;
6211 const char *reason;
6212 PyObject *errorHandler = NULL;
6213 PyObject *exc = NULL;
6214
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006215 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006216 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006217 1))
6218 return NULL;
6219
Thomas Wouters89f507f2006-12-13 04:49:30 +00006220 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006221 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006222 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006224 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006225 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006226 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006227 end = s + size;
6228
6229 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006230 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006231 Py_UCS4 ch;
6232 /* We copy the raw representation one byte at a time because the
6233 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006234 ((char *) &uch)[0] = s[0];
6235 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006236#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006237 ((char *) &uch)[2] = s[2];
6238 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006239#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006240 ch = uch;
6241
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006242 /* We have to sanity check the raw data, otherwise doom looms for
6243 some malformed UCS-4 data. */
6244 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006245#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006246 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006247#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006248 end-s < Py_UNICODE_SIZE
6249 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006251 startinpos = s - starts;
6252 if (end-s < Py_UNICODE_SIZE) {
6253 endinpos = end-starts;
6254 reason = "truncated input";
6255 }
6256 else {
6257 endinpos = s - starts + Py_UNICODE_SIZE;
6258 reason = "illegal code point (> 0x10FFFF)";
6259 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006260 if (unicode_decode_call_errorhandler(
6261 errors, &errorHandler,
6262 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006263 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006264 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006265 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006266 continue;
6267 }
6268
6269 s += Py_UNICODE_SIZE;
6270#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006271 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006272 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006273 Py_UNICODE uch2;
6274 ((char *) &uch2)[0] = s[0];
6275 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006276 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006277 {
Victor Stinner551ac952011-11-29 22:58:13 +01006278 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006279 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006280 }
6281 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006282#endif
6283
6284 if (unicode_putchar(&v, &outpos, ch) < 0)
6285 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006286 }
6287
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006288 if (PyUnicode_Resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006289 goto onError;
6290 Py_XDECREF(errorHandler);
6291 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006292 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006293
Benjamin Peterson29060642009-01-31 22:14:21 +00006294 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006295 Py_XDECREF(v);
6296 Py_XDECREF(errorHandler);
6297 Py_XDECREF(exc);
6298 return NULL;
6299}
6300
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301/* --- Latin-1 Codec ------------------------------------------------------ */
6302
Alexander Belopolsky40018472011-02-26 01:02:56 +00006303PyObject *
6304PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006305 Py_ssize_t size,
6306 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006309 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310}
6311
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006312/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006313static void
6314make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006315 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006316 PyObject *unicode,
6317 Py_ssize_t startpos, Py_ssize_t endpos,
6318 const char *reason)
6319{
6320 if (*exceptionObject == NULL) {
6321 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006322 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006323 encoding, unicode, startpos, endpos, reason);
6324 }
6325 else {
6326 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6327 goto onError;
6328 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6329 goto onError;
6330 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6331 goto onError;
6332 return;
6333 onError:
6334 Py_DECREF(*exceptionObject);
6335 *exceptionObject = NULL;
6336 }
6337}
6338
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006339/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006340static void
6341raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006342 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006343 PyObject *unicode,
6344 Py_ssize_t startpos, Py_ssize_t endpos,
6345 const char *reason)
6346{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006347 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006348 encoding, unicode, startpos, endpos, reason);
6349 if (*exceptionObject != NULL)
6350 PyCodec_StrictErrors(*exceptionObject);
6351}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006352
6353/* error handling callback helper:
6354 build arguments, call the callback and check the arguments,
6355 put the result into newpos and return the replacement string, which
6356 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006357static PyObject *
6358unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006359 PyObject **errorHandler,
6360 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006361 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006362 Py_ssize_t startpos, Py_ssize_t endpos,
6363 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006364{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006365 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006366 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006367 PyObject *restuple;
6368 PyObject *resunicode;
6369
6370 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006371 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006372 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006373 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006374 }
6375
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006376 if (PyUnicode_READY(unicode) < 0)
6377 return NULL;
6378 len = PyUnicode_GET_LENGTH(unicode);
6379
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006380 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006381 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006382 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006384
6385 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006387 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006388 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006389 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006390 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006391 Py_DECREF(restuple);
6392 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006393 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006394 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006395 &resunicode, newpos)) {
6396 Py_DECREF(restuple);
6397 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006398 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006399 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6400 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6401 Py_DECREF(restuple);
6402 return NULL;
6403 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006404 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006405 *newpos = len + *newpos;
6406 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006407 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6408 Py_DECREF(restuple);
6409 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006410 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006411 Py_INCREF(resunicode);
6412 Py_DECREF(restuple);
6413 return resunicode;
6414}
6415
Alexander Belopolsky40018472011-02-26 01:02:56 +00006416static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006417unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006418 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006419 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006420{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006421 /* input state */
6422 Py_ssize_t pos=0, size;
6423 int kind;
6424 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006425 /* output object */
6426 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006427 /* pointer into the output */
6428 char *str;
6429 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006430 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006431 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6432 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006433 PyObject *errorHandler = NULL;
6434 PyObject *exc = NULL;
6435 /* the following variable is used for caching string comparisons
6436 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6437 int known_errorHandler = -1;
6438
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006439 if (PyUnicode_READY(unicode) < 0)
6440 return NULL;
6441 size = PyUnicode_GET_LENGTH(unicode);
6442 kind = PyUnicode_KIND(unicode);
6443 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006444 /* allocate enough for a simple encoding without
6445 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006446 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006447 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006448 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006449 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006450 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006451 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006452 ressize = size;
6453
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006454 while (pos < size) {
6455 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006456
Benjamin Peterson29060642009-01-31 22:14:21 +00006457 /* can we encode this? */
6458 if (c<limit) {
6459 /* no overflow check, because we know that the space is enough */
6460 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006461 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006462 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006463 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 Py_ssize_t requiredsize;
6465 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006466 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006468 Py_ssize_t collstart = pos;
6469 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006470 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006471 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006472 ++collend;
6473 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6474 if (known_errorHandler==-1) {
6475 if ((errors==NULL) || (!strcmp(errors, "strict")))
6476 known_errorHandler = 1;
6477 else if (!strcmp(errors, "replace"))
6478 known_errorHandler = 2;
6479 else if (!strcmp(errors, "ignore"))
6480 known_errorHandler = 3;
6481 else if (!strcmp(errors, "xmlcharrefreplace"))
6482 known_errorHandler = 4;
6483 else
6484 known_errorHandler = 0;
6485 }
6486 switch (known_errorHandler) {
6487 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006488 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 goto onError;
6490 case 2: /* replace */
6491 while (collstart++<collend)
6492 *str++ = '?'; /* fall through */
6493 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006494 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 break;
6496 case 4: /* xmlcharrefreplace */
6497 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006498 /* determine replacement size */
6499 for (i = collstart, repsize = 0; i < collend; ++i) {
6500 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6501 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006503 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006505 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006507 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006508 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006509 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006510 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006511 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006513 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006514 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006516 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006518 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006519 if (requiredsize > ressize) {
6520 if (requiredsize<2*ressize)
6521 requiredsize = 2*ressize;
6522 if (_PyBytes_Resize(&res, requiredsize))
6523 goto onError;
6524 str = PyBytes_AS_STRING(res) + respos;
6525 ressize = requiredsize;
6526 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006527 /* generate replacement */
6528 for (i = collstart; i < collend; ++i) {
6529 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006530 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006531 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 break;
6533 default:
6534 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006535 encoding, reason, unicode, &exc,
6536 collstart, collend, &newpos);
6537 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6538 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006539 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006540 if (PyBytes_Check(repunicode)) {
6541 /* Directly copy bytes result to output. */
6542 repsize = PyBytes_Size(repunicode);
6543 if (repsize > 1) {
6544 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006545 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006546 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6547 Py_DECREF(repunicode);
6548 goto onError;
6549 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006550 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006551 ressize += repsize-1;
6552 }
6553 memcpy(str, PyBytes_AsString(repunicode), repsize);
6554 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006555 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006556 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006557 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006558 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 /* need more space? (at least enough for what we
6560 have+the replacement+the rest of the string, so
6561 we won't have to check space for encodable characters) */
6562 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006563 repsize = PyUnicode_GET_LENGTH(repunicode);
6564 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006565 if (requiredsize > ressize) {
6566 if (requiredsize<2*ressize)
6567 requiredsize = 2*ressize;
6568 if (_PyBytes_Resize(&res, requiredsize)) {
6569 Py_DECREF(repunicode);
6570 goto onError;
6571 }
6572 str = PyBytes_AS_STRING(res) + respos;
6573 ressize = requiredsize;
6574 }
6575 /* check if there is anything unencodable in the replacement
6576 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006577 for (i = 0; repsize-->0; ++i, ++str) {
6578 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006580 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006581 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006582 Py_DECREF(repunicode);
6583 goto onError;
6584 }
6585 *str = (char)c;
6586 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006587 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006588 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006589 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006590 }
6591 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006592 /* Resize if we allocated to much */
6593 size = str - PyBytes_AS_STRING(res);
6594 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006595 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006596 if (_PyBytes_Resize(&res, size) < 0)
6597 goto onError;
6598 }
6599
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006600 Py_XDECREF(errorHandler);
6601 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006602 return res;
6603
6604 onError:
6605 Py_XDECREF(res);
6606 Py_XDECREF(errorHandler);
6607 Py_XDECREF(exc);
6608 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006609}
6610
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006611/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006612PyObject *
6613PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006614 Py_ssize_t size,
6615 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006617 PyObject *result;
6618 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6619 if (unicode == NULL)
6620 return NULL;
6621 result = unicode_encode_ucs1(unicode, errors, 256);
6622 Py_DECREF(unicode);
6623 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624}
6625
Alexander Belopolsky40018472011-02-26 01:02:56 +00006626PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006627_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628{
6629 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006630 PyErr_BadArgument();
6631 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006633 if (PyUnicode_READY(unicode) == -1)
6634 return NULL;
6635 /* Fast path: if it is a one-byte string, construct
6636 bytes object directly. */
6637 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6638 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6639 PyUnicode_GET_LENGTH(unicode));
6640 /* Non-Latin-1 characters present. Defer to above function to
6641 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006642 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006643}
6644
6645PyObject*
6646PyUnicode_AsLatin1String(PyObject *unicode)
6647{
6648 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649}
6650
6651/* --- 7-bit ASCII Codec -------------------------------------------------- */
6652
Alexander Belopolsky40018472011-02-26 01:02:56 +00006653PyObject *
6654PyUnicode_DecodeASCII(const char *s,
6655 Py_ssize_t size,
6656 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006658 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006659 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006660 int kind;
6661 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006662 Py_ssize_t startinpos;
6663 Py_ssize_t endinpos;
6664 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006666 int has_error;
6667 const unsigned char *p = (const unsigned char *)s;
6668 const unsigned char *end = p + size;
6669 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006670 PyObject *errorHandler = NULL;
6671 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006672
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006673 if (size == 0) {
6674 Py_INCREF(unicode_empty);
6675 return unicode_empty;
6676 }
6677
Guido van Rossumd57fd912000-03-10 22:53:23 +00006678 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006679 if (size == 1 && (unsigned char)s[0] < 128)
6680 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006681
Victor Stinner702c7342011-10-05 13:50:52 +02006682 has_error = 0;
6683 while (p < end && !has_error) {
6684 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6685 an explanation. */
6686 if (!((size_t) p & LONG_PTR_MASK)) {
6687 /* Help register allocation */
6688 register const unsigned char *_p = p;
6689 while (_p < aligned_end) {
6690 unsigned long value = *(unsigned long *) _p;
6691 if (value & ASCII_CHAR_MASK) {
6692 has_error = 1;
6693 break;
6694 }
6695 _p += SIZEOF_LONG;
6696 }
6697 if (_p == end)
6698 break;
6699 if (has_error)
6700 break;
6701 p = _p;
6702 }
6703 if (*p & 0x80) {
6704 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006705 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006706 }
6707 else {
6708 ++p;
6709 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006710 }
Victor Stinner702c7342011-10-05 13:50:52 +02006711 if (!has_error)
6712 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006713
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006714 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006716 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006718 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006719 kind = PyUnicode_KIND(v);
6720 data = PyUnicode_DATA(v);
6721 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006722 e = s + size;
6723 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006724 register unsigned char c = (unsigned char)*s;
6725 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006726 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 ++s;
6728 }
6729 else {
6730 startinpos = s-starts;
6731 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006732 if (unicode_decode_call_errorhandler(
6733 errors, &errorHandler,
6734 "ascii", "ordinal not in range(128)",
6735 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006736 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006738 kind = PyUnicode_KIND(v);
6739 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006742 if (PyUnicode_Resize(&v, outpos) < 0)
6743 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006744 Py_XDECREF(errorHandler);
6745 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006746 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006747 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006748
Benjamin Peterson29060642009-01-31 22:14:21 +00006749 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006751 Py_XDECREF(errorHandler);
6752 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753 return NULL;
6754}
6755
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006756/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006757PyObject *
6758PyUnicode_EncodeASCII(const Py_UNICODE *p,
6759 Py_ssize_t size,
6760 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006762 PyObject *result;
6763 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6764 if (unicode == NULL)
6765 return NULL;
6766 result = unicode_encode_ucs1(unicode, errors, 128);
6767 Py_DECREF(unicode);
6768 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769}
6770
Alexander Belopolsky40018472011-02-26 01:02:56 +00006771PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006772_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773{
6774 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 PyErr_BadArgument();
6776 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006778 if (PyUnicode_READY(unicode) == -1)
6779 return NULL;
6780 /* Fast path: if it is an ASCII-only string, construct bytes object
6781 directly. Else defer to above function to raise the exception. */
6782 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6783 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6784 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006785 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006786}
6787
6788PyObject *
6789PyUnicode_AsASCIIString(PyObject *unicode)
6790{
6791 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792}
6793
Victor Stinner99b95382011-07-04 14:23:54 +02006794#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006795
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006796/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006797
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006798#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006799#define NEED_RETRY
6800#endif
6801
Victor Stinner3a50e702011-10-18 21:21:00 +02006802#ifndef WC_ERR_INVALID_CHARS
6803# define WC_ERR_INVALID_CHARS 0x0080
6804#endif
6805
6806static char*
6807code_page_name(UINT code_page, PyObject **obj)
6808{
6809 *obj = NULL;
6810 if (code_page == CP_ACP)
6811 return "mbcs";
6812 if (code_page == CP_UTF7)
6813 return "CP_UTF7";
6814 if (code_page == CP_UTF8)
6815 return "CP_UTF8";
6816
6817 *obj = PyBytes_FromFormat("cp%u", code_page);
6818 if (*obj == NULL)
6819 return NULL;
6820 return PyBytes_AS_STRING(*obj);
6821}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006822
Alexander Belopolsky40018472011-02-26 01:02:56 +00006823static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006824is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006825{
6826 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006827 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006828
Victor Stinner3a50e702011-10-18 21:21:00 +02006829 if (!IsDBCSLeadByteEx(code_page, *curr))
6830 return 0;
6831
6832 prev = CharPrevExA(code_page, s, curr, 0);
6833 if (prev == curr)
6834 return 1;
6835 /* FIXME: This code is limited to "true" double-byte encodings,
6836 as it assumes an incomplete character consists of a single
6837 byte. */
6838 if (curr - prev == 2)
6839 return 1;
6840 if (!IsDBCSLeadByteEx(code_page, *prev))
6841 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006842 return 0;
6843}
6844
Victor Stinner3a50e702011-10-18 21:21:00 +02006845static DWORD
6846decode_code_page_flags(UINT code_page)
6847{
6848 if (code_page == CP_UTF7) {
6849 /* The CP_UTF7 decoder only supports flags=0 */
6850 return 0;
6851 }
6852 else
6853 return MB_ERR_INVALID_CHARS;
6854}
6855
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006856/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006857 * Decode a byte string from a Windows code page into unicode object in strict
6858 * mode.
6859 *
6860 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6861 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006862 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006863static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006864decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006865 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006866 const char *in,
6867 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006868{
Victor Stinner3a50e702011-10-18 21:21:00 +02006869 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006870 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006871 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006872
6873 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006874 assert(insize > 0);
6875 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6876 if (outsize <= 0)
6877 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006878
6879 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006880 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006881 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006882 if (*v == NULL)
6883 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006884 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006885 }
6886 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006887 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006888 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006889 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006890 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006891 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006892 }
6893
6894 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006895 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6896 if (outsize <= 0)
6897 goto error;
6898 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006899
Victor Stinner3a50e702011-10-18 21:21:00 +02006900error:
6901 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6902 return -2;
6903 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006904 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006905}
6906
Victor Stinner3a50e702011-10-18 21:21:00 +02006907/*
6908 * Decode a byte string from a code page into unicode object with an error
6909 * handler.
6910 *
6911 * Returns consumed size if succeed, or raise a WindowsError or
6912 * UnicodeDecodeError exception and returns -1 on error.
6913 */
6914static int
6915decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006916 PyObject **v,
6917 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006918 const char *errors)
6919{
6920 const char *startin = in;
6921 const char *endin = in + size;
6922 const DWORD flags = decode_code_page_flags(code_page);
6923 /* Ideally, we should get reason from FormatMessage. This is the Windows
6924 2000 English version of the message. */
6925 const char *reason = "No mapping for the Unicode character exists "
6926 "in the target code page.";
6927 /* each step cannot decode more than 1 character, but a character can be
6928 represented as a surrogate pair */
6929 wchar_t buffer[2], *startout, *out;
6930 int insize, outsize;
6931 PyObject *errorHandler = NULL;
6932 PyObject *exc = NULL;
6933 PyObject *encoding_obj = NULL;
6934 char *encoding;
6935 DWORD err;
6936 int ret = -1;
6937
6938 assert(size > 0);
6939
6940 encoding = code_page_name(code_page, &encoding_obj);
6941 if (encoding == NULL)
6942 return -1;
6943
6944 if (errors == NULL || strcmp(errors, "strict") == 0) {
6945 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6946 UnicodeDecodeError. */
6947 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6948 if (exc != NULL) {
6949 PyCodec_StrictErrors(exc);
6950 Py_CLEAR(exc);
6951 }
6952 goto error;
6953 }
6954
6955 if (*v == NULL) {
6956 /* Create unicode object */
6957 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6958 PyErr_NoMemory();
6959 goto error;
6960 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006961 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006962 if (*v == NULL)
6963 goto error;
6964 startout = PyUnicode_AS_UNICODE(*v);
6965 }
6966 else {
6967 /* Extend unicode object */
6968 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6969 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6970 PyErr_NoMemory();
6971 goto error;
6972 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006973 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006974 goto error;
6975 startout = PyUnicode_AS_UNICODE(*v) + n;
6976 }
6977
6978 /* Decode the byte string character per character */
6979 out = startout;
6980 while (in < endin)
6981 {
6982 /* Decode a character */
6983 insize = 1;
6984 do
6985 {
6986 outsize = MultiByteToWideChar(code_page, flags,
6987 in, insize,
6988 buffer, Py_ARRAY_LENGTH(buffer));
6989 if (outsize > 0)
6990 break;
6991 err = GetLastError();
6992 if (err != ERROR_NO_UNICODE_TRANSLATION
6993 && err != ERROR_INSUFFICIENT_BUFFER)
6994 {
6995 PyErr_SetFromWindowsErr(0);
6996 goto error;
6997 }
6998 insize++;
6999 }
7000 /* 4=maximum length of a UTF-8 sequence */
7001 while (insize <= 4 && (in + insize) <= endin);
7002
7003 if (outsize <= 0) {
7004 Py_ssize_t startinpos, endinpos, outpos;
7005
7006 startinpos = in - startin;
7007 endinpos = startinpos + 1;
7008 outpos = out - PyUnicode_AS_UNICODE(*v);
7009 if (unicode_decode_call_errorhandler(
7010 errors, &errorHandler,
7011 encoding, reason,
7012 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007013 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007014 {
7015 goto error;
7016 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007017 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007018 }
7019 else {
7020 in += insize;
7021 memcpy(out, buffer, outsize * sizeof(wchar_t));
7022 out += outsize;
7023 }
7024 }
7025
7026 /* write a NUL character at the end */
7027 *out = 0;
7028
7029 /* Extend unicode object */
7030 outsize = out - startout;
7031 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007032 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007033 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007034 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007035
7036error:
7037 Py_XDECREF(encoding_obj);
7038 Py_XDECREF(errorHandler);
7039 Py_XDECREF(exc);
7040 return ret;
7041}
7042
Victor Stinner3a50e702011-10-18 21:21:00 +02007043static PyObject *
7044decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007045 const char *s, Py_ssize_t size,
7046 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007047{
Victor Stinner76a31a62011-11-04 00:05:13 +01007048 PyObject *v = NULL;
7049 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007050
Victor Stinner3a50e702011-10-18 21:21:00 +02007051 if (code_page < 0) {
7052 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7053 return NULL;
7054 }
7055
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007056 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007057 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007058
Victor Stinner76a31a62011-11-04 00:05:13 +01007059 do
7060 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007061#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007062 if (size > INT_MAX) {
7063 chunk_size = INT_MAX;
7064 final = 0;
7065 done = 0;
7066 }
7067 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007068#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007069 {
7070 chunk_size = (int)size;
7071 final = (consumed == NULL);
7072 done = 1;
7073 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007074
Victor Stinner76a31a62011-11-04 00:05:13 +01007075 /* Skip trailing lead-byte unless 'final' is set */
7076 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7077 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007078
Victor Stinner76a31a62011-11-04 00:05:13 +01007079 if (chunk_size == 0 && done) {
7080 if (v != NULL)
7081 break;
7082 Py_INCREF(unicode_empty);
7083 return unicode_empty;
7084 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007085
Victor Stinner76a31a62011-11-04 00:05:13 +01007086
7087 converted = decode_code_page_strict(code_page, &v,
7088 s, chunk_size);
7089 if (converted == -2)
7090 converted = decode_code_page_errors(code_page, &v,
7091 s, chunk_size,
7092 errors);
7093 assert(converted != 0);
7094
7095 if (converted < 0) {
7096 Py_XDECREF(v);
7097 return NULL;
7098 }
7099
7100 if (consumed)
7101 *consumed += converted;
7102
7103 s += converted;
7104 size -= converted;
7105 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007106
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007107 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007108}
7109
Alexander Belopolsky40018472011-02-26 01:02:56 +00007110PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007111PyUnicode_DecodeCodePageStateful(int code_page,
7112 const char *s,
7113 Py_ssize_t size,
7114 const char *errors,
7115 Py_ssize_t *consumed)
7116{
7117 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7118}
7119
7120PyObject *
7121PyUnicode_DecodeMBCSStateful(const char *s,
7122 Py_ssize_t size,
7123 const char *errors,
7124 Py_ssize_t *consumed)
7125{
7126 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7127}
7128
7129PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007130PyUnicode_DecodeMBCS(const char *s,
7131 Py_ssize_t size,
7132 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007133{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007134 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7135}
7136
Victor Stinner3a50e702011-10-18 21:21:00 +02007137static DWORD
7138encode_code_page_flags(UINT code_page, const char *errors)
7139{
7140 if (code_page == CP_UTF8) {
7141 if (winver.dwMajorVersion >= 6)
7142 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7143 and later */
7144 return WC_ERR_INVALID_CHARS;
7145 else
7146 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7147 return 0;
7148 }
7149 else if (code_page == CP_UTF7) {
7150 /* CP_UTF7 only supports flags=0 */
7151 return 0;
7152 }
7153 else {
7154 if (errors != NULL && strcmp(errors, "replace") == 0)
7155 return 0;
7156 else
7157 return WC_NO_BEST_FIT_CHARS;
7158 }
7159}
7160
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007161/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007162 * Encode a Unicode string to a Windows code page into a byte string in strict
7163 * mode.
7164 *
7165 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7166 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007167 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007168static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007169encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007170 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007171 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007172{
Victor Stinner554f3f02010-06-16 23:33:54 +00007173 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007174 BOOL *pusedDefaultChar = &usedDefaultChar;
7175 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007176 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007177 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007178 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 const DWORD flags = encode_code_page_flags(code_page, NULL);
7180 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007181 /* Create a substring so that we can get the UTF-16 representation
7182 of just the slice under consideration. */
7183 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007184
Martin v. Löwis3d325192011-11-04 18:23:06 +01007185 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007186
Victor Stinner3a50e702011-10-18 21:21:00 +02007187 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007188 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007189 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007190 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007191
Victor Stinner2fc507f2011-11-04 20:06:39 +01007192 substring = PyUnicode_Substring(unicode, offset, offset+len);
7193 if (substring == NULL)
7194 return -1;
7195 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7196 if (p == NULL) {
7197 Py_DECREF(substring);
7198 return -1;
7199 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007200
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007201 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007202 outsize = WideCharToMultiByte(code_page, flags,
7203 p, size,
7204 NULL, 0,
7205 NULL, pusedDefaultChar);
7206 if (outsize <= 0)
7207 goto error;
7208 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007209 if (pusedDefaultChar && *pusedDefaultChar) {
7210 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007211 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007212 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007213
Victor Stinner3a50e702011-10-18 21:21:00 +02007214 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007215 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007217 if (*outbytes == NULL) {
7218 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007219 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007220 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007221 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007222 }
7223 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007224 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007225 const Py_ssize_t n = PyBytes_Size(*outbytes);
7226 if (outsize > PY_SSIZE_T_MAX - n) {
7227 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007228 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007229 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007230 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007231 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7232 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007233 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007234 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007235 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007236 }
7237
7238 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007239 outsize = WideCharToMultiByte(code_page, flags,
7240 p, size,
7241 out, outsize,
7242 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007243 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007244 if (outsize <= 0)
7245 goto error;
7246 if (pusedDefaultChar && *pusedDefaultChar)
7247 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007248 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007249
Victor Stinner3a50e702011-10-18 21:21:00 +02007250error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007251 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007252 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7253 return -2;
7254 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007255 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007256}
7257
Victor Stinner3a50e702011-10-18 21:21:00 +02007258/*
7259 * Encode a Unicode string to a Windows code page into a byte string using a
7260 * error handler.
7261 *
7262 * Returns consumed characters if succeed, or raise a WindowsError and returns
7263 * -1 on other error.
7264 */
7265static int
7266encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007267 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007268 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007269{
Victor Stinner3a50e702011-10-18 21:21:00 +02007270 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007271 Py_ssize_t pos = unicode_offset;
7272 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007273 /* Ideally, we should get reason from FormatMessage. This is the Windows
7274 2000 English version of the message. */
7275 const char *reason = "invalid character";
7276 /* 4=maximum length of a UTF-8 sequence */
7277 char buffer[4];
7278 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7279 Py_ssize_t outsize;
7280 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007281 PyObject *errorHandler = NULL;
7282 PyObject *exc = NULL;
7283 PyObject *encoding_obj = NULL;
7284 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007285 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007286 PyObject *rep;
7287 int ret = -1;
7288
7289 assert(insize > 0);
7290
7291 encoding = code_page_name(code_page, &encoding_obj);
7292 if (encoding == NULL)
7293 return -1;
7294
7295 if (errors == NULL || strcmp(errors, "strict") == 0) {
7296 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7297 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007298 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007299 if (exc != NULL) {
7300 PyCodec_StrictErrors(exc);
7301 Py_DECREF(exc);
7302 }
7303 Py_XDECREF(encoding_obj);
7304 return -1;
7305 }
7306
7307 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7308 pusedDefaultChar = &usedDefaultChar;
7309 else
7310 pusedDefaultChar = NULL;
7311
7312 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7313 PyErr_NoMemory();
7314 goto error;
7315 }
7316 outsize = insize * Py_ARRAY_LENGTH(buffer);
7317
7318 if (*outbytes == NULL) {
7319 /* Create string object */
7320 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7321 if (*outbytes == NULL)
7322 goto error;
7323 out = PyBytes_AS_STRING(*outbytes);
7324 }
7325 else {
7326 /* Extend string object */
7327 Py_ssize_t n = PyBytes_Size(*outbytes);
7328 if (n > PY_SSIZE_T_MAX - outsize) {
7329 PyErr_NoMemory();
7330 goto error;
7331 }
7332 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7333 goto error;
7334 out = PyBytes_AS_STRING(*outbytes) + n;
7335 }
7336
7337 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007338 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007339 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007340 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7341 wchar_t chars[2];
7342 int charsize;
7343 if (ch < 0x10000) {
7344 chars[0] = (wchar_t)ch;
7345 charsize = 1;
7346 }
7347 else {
7348 ch -= 0x10000;
7349 chars[0] = 0xd800 + (ch >> 10);
7350 chars[1] = 0xdc00 + (ch & 0x3ff);
7351 charsize = 2;
7352 }
7353
Victor Stinner3a50e702011-10-18 21:21:00 +02007354 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007355 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007356 buffer, Py_ARRAY_LENGTH(buffer),
7357 NULL, pusedDefaultChar);
7358 if (outsize > 0) {
7359 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7360 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007361 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007362 memcpy(out, buffer, outsize);
7363 out += outsize;
7364 continue;
7365 }
7366 }
7367 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7368 PyErr_SetFromWindowsErr(0);
7369 goto error;
7370 }
7371
Victor Stinner3a50e702011-10-18 21:21:00 +02007372 rep = unicode_encode_call_errorhandler(
7373 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007374 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007375 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007376 if (rep == NULL)
7377 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007378 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007379
7380 if (PyBytes_Check(rep)) {
7381 outsize = PyBytes_GET_SIZE(rep);
7382 if (outsize != 1) {
7383 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7384 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7385 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7386 Py_DECREF(rep);
7387 goto error;
7388 }
7389 out = PyBytes_AS_STRING(*outbytes) + offset;
7390 }
7391 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7392 out += outsize;
7393 }
7394 else {
7395 Py_ssize_t i;
7396 enum PyUnicode_Kind kind;
7397 void *data;
7398
7399 if (PyUnicode_READY(rep) < 0) {
7400 Py_DECREF(rep);
7401 goto error;
7402 }
7403
7404 outsize = PyUnicode_GET_LENGTH(rep);
7405 if (outsize != 1) {
7406 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7407 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7408 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7409 Py_DECREF(rep);
7410 goto error;
7411 }
7412 out = PyBytes_AS_STRING(*outbytes) + offset;
7413 }
7414 kind = PyUnicode_KIND(rep);
7415 data = PyUnicode_DATA(rep);
7416 for (i=0; i < outsize; i++) {
7417 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7418 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007419 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007420 encoding, unicode,
7421 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007422 "unable to encode error handler result to ASCII");
7423 Py_DECREF(rep);
7424 goto error;
7425 }
7426 *out = (unsigned char)ch;
7427 out++;
7428 }
7429 }
7430 Py_DECREF(rep);
7431 }
7432 /* write a NUL byte */
7433 *out = 0;
7434 outsize = out - PyBytes_AS_STRING(*outbytes);
7435 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7436 if (_PyBytes_Resize(outbytes, outsize) < 0)
7437 goto error;
7438 ret = 0;
7439
7440error:
7441 Py_XDECREF(encoding_obj);
7442 Py_XDECREF(errorHandler);
7443 Py_XDECREF(exc);
7444 return ret;
7445}
7446
Victor Stinner3a50e702011-10-18 21:21:00 +02007447static PyObject *
7448encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007449 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007450 const char *errors)
7451{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007452 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007453 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007454 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007455 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007456
Victor Stinner2fc507f2011-11-04 20:06:39 +01007457 if (PyUnicode_READY(unicode) < 0)
7458 return NULL;
7459 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007460
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 if (code_page < 0) {
7462 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7463 return NULL;
7464 }
7465
Martin v. Löwis3d325192011-11-04 18:23:06 +01007466 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007467 return PyBytes_FromStringAndSize(NULL, 0);
7468
Victor Stinner7581cef2011-11-03 22:32:33 +01007469 offset = 0;
7470 do
7471 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007472#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007473 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007474 chunks. */
7475 if (len > INT_MAX/2) {
7476 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007477 done = 0;
7478 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007479 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007480#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007481 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007482 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007483 done = 1;
7484 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007485
Victor Stinner76a31a62011-11-04 00:05:13 +01007486 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007487 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007488 errors);
7489 if (ret == -2)
7490 ret = encode_code_page_errors(code_page, &outbytes,
7491 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007492 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007493 if (ret < 0) {
7494 Py_XDECREF(outbytes);
7495 return NULL;
7496 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007497
Victor Stinner7581cef2011-11-03 22:32:33 +01007498 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007499 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007500 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007501
Victor Stinner3a50e702011-10-18 21:21:00 +02007502 return outbytes;
7503}
7504
7505PyObject *
7506PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7507 Py_ssize_t size,
7508 const char *errors)
7509{
Victor Stinner7581cef2011-11-03 22:32:33 +01007510 PyObject *unicode, *res;
7511 unicode = PyUnicode_FromUnicode(p, size);
7512 if (unicode == NULL)
7513 return NULL;
7514 res = encode_code_page(CP_ACP, unicode, errors);
7515 Py_DECREF(unicode);
7516 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007517}
7518
7519PyObject *
7520PyUnicode_EncodeCodePage(int code_page,
7521 PyObject *unicode,
7522 const char *errors)
7523{
Victor Stinner7581cef2011-11-03 22:32:33 +01007524 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007525}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007526
Alexander Belopolsky40018472011-02-26 01:02:56 +00007527PyObject *
7528PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007529{
7530 if (!PyUnicode_Check(unicode)) {
7531 PyErr_BadArgument();
7532 return NULL;
7533 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007534 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007535}
7536
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007537#undef NEED_RETRY
7538
Victor Stinner99b95382011-07-04 14:23:54 +02007539#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007540
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541/* --- Character Mapping Codec -------------------------------------------- */
7542
Alexander Belopolsky40018472011-02-26 01:02:56 +00007543PyObject *
7544PyUnicode_DecodeCharmap(const char *s,
7545 Py_ssize_t size,
7546 PyObject *mapping,
7547 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007549 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007550 Py_ssize_t startinpos;
7551 Py_ssize_t endinpos;
7552 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007553 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007554 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007555 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007556 PyObject *errorHandler = NULL;
7557 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007558
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559 /* Default to Latin-1 */
7560 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007561 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007563 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007565 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007567 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007568 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007569 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007570 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007571 Py_ssize_t maplen;
7572 enum PyUnicode_Kind kind;
7573 void *data;
7574 Py_UCS4 x;
7575
7576 if (PyUnicode_READY(mapping) < 0)
7577 return NULL;
7578
7579 maplen = PyUnicode_GET_LENGTH(mapping);
7580 data = PyUnicode_DATA(mapping);
7581 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007582 while (s < e) {
7583 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007586 x = PyUnicode_READ(kind, data, ch);
7587 else
7588 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007590 if (x == 0xfffe)
7591 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007592 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 startinpos = s-starts;
7594 endinpos = startinpos+1;
7595 if (unicode_decode_call_errorhandler(
7596 errors, &errorHandler,
7597 "charmap", "character maps to <undefined>",
7598 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007599 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007600 goto onError;
7601 }
7602 continue;
7603 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007604
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007605 if (unicode_putchar(&v, &outpos, x) < 0)
7606 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007607 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007608 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007609 }
7610 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007611 while (s < e) {
7612 unsigned char ch = *s;
7613 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007614
Benjamin Peterson29060642009-01-31 22:14:21 +00007615 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7616 w = PyLong_FromLong((long)ch);
7617 if (w == NULL)
7618 goto onError;
7619 x = PyObject_GetItem(mapping, w);
7620 Py_DECREF(w);
7621 if (x == NULL) {
7622 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7623 /* No mapping found means: mapping is undefined. */
7624 PyErr_Clear();
7625 x = Py_None;
7626 Py_INCREF(x);
7627 } else
7628 goto onError;
7629 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007630
Benjamin Peterson29060642009-01-31 22:14:21 +00007631 /* Apply mapping */
7632 if (PyLong_Check(x)) {
7633 long value = PyLong_AS_LONG(x);
7634 if (value < 0 || value > 65535) {
7635 PyErr_SetString(PyExc_TypeError,
7636 "character mapping must be in range(65536)");
7637 Py_DECREF(x);
7638 goto onError;
7639 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007640 if (unicode_putchar(&v, &outpos, value) < 0)
7641 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007642 }
7643 else if (x == Py_None) {
7644 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 startinpos = s-starts;
7646 endinpos = startinpos+1;
7647 if (unicode_decode_call_errorhandler(
7648 errors, &errorHandler,
7649 "charmap", "character maps to <undefined>",
7650 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007651 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007652 Py_DECREF(x);
7653 goto onError;
7654 }
7655 Py_DECREF(x);
7656 continue;
7657 }
7658 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007659 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007660
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007661 if (PyUnicode_READY(x) < 0)
7662 goto onError;
7663 targetsize = PyUnicode_GET_LENGTH(x);
7664
7665 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007666 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007667 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007668 PyUnicode_READ_CHAR(x, 0)) < 0)
7669 goto onError;
7670 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007671 else if (targetsize > 1) {
7672 /* 1-n mapping */
7673 if (targetsize > extrachars) {
7674 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 Py_ssize_t needed = (targetsize - extrachars) + \
7676 (targetsize << 2);
7677 extrachars += needed;
7678 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007679 if (PyUnicode_Resize(&v,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007680 PyUnicode_GET_LENGTH(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007681 Py_DECREF(x);
7682 goto onError;
7683 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007685 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7686 goto onError;
7687 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7688 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007689 extrachars -= targetsize;
7690 }
7691 /* 1-0 mapping: skip the character */
7692 }
7693 else {
7694 /* wrong return value */
7695 PyErr_SetString(PyExc_TypeError,
7696 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007697 Py_DECREF(x);
7698 goto onError;
7699 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007700 Py_DECREF(x);
7701 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007702 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007704 if (PyUnicode_Resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007705 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007706 Py_XDECREF(errorHandler);
7707 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007708 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007709
Benjamin Peterson29060642009-01-31 22:14:21 +00007710 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007711 Py_XDECREF(errorHandler);
7712 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713 Py_XDECREF(v);
7714 return NULL;
7715}
7716
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007717/* Charmap encoding: the lookup table */
7718
Alexander Belopolsky40018472011-02-26 01:02:56 +00007719struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007720 PyObject_HEAD
7721 unsigned char level1[32];
7722 int count2, count3;
7723 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007724};
7725
7726static PyObject*
7727encoding_map_size(PyObject *obj, PyObject* args)
7728{
7729 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007730 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007731 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007732}
7733
7734static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007735 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007736 PyDoc_STR("Return the size (in bytes) of this object") },
7737 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007738};
7739
7740static void
7741encoding_map_dealloc(PyObject* o)
7742{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007743 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007744}
7745
/* Type object for the charmap encoding lookup table (struct encoding_map).
   Only the name, the basic size, the deallocator, the default flags and the
   method table are filled in; every other slot is 0. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
7789
7790PyObject*
7791PyUnicode_BuildEncodingMap(PyObject* string)
7792{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007793 PyObject *result;
7794 struct encoding_map *mresult;
7795 int i;
7796 int need_dict = 0;
7797 unsigned char level1[32];
7798 unsigned char level2[512];
7799 unsigned char *mlevel1, *mlevel2, *mlevel3;
7800 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007801 int kind;
7802 void *data;
7803 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007805 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007806 PyErr_BadArgument();
7807 return NULL;
7808 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007809 kind = PyUnicode_KIND(string);
7810 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007811 memset(level1, 0xFF, sizeof level1);
7812 memset(level2, 0xFF, sizeof level2);
7813
7814 /* If there isn't a one-to-one mapping of NULL to \0,
7815 or if there are non-BMP characters, we need to use
7816 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007817 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007818 need_dict = 1;
7819 for (i = 1; i < 256; i++) {
7820 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007821 ch = PyUnicode_READ(kind, data, i);
7822 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007823 need_dict = 1;
7824 break;
7825 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007826 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007827 /* unmapped character */
7828 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007829 l1 = ch >> 11;
7830 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007831 if (level1[l1] == 0xFF)
7832 level1[l1] = count2++;
7833 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007834 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007835 }
7836
7837 if (count2 >= 0xFF || count3 >= 0xFF)
7838 need_dict = 1;
7839
7840 if (need_dict) {
7841 PyObject *result = PyDict_New();
7842 PyObject *key, *value;
7843 if (!result)
7844 return NULL;
7845 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007846 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007847 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007848 if (!key || !value)
7849 goto failed1;
7850 if (PyDict_SetItem(result, key, value) == -1)
7851 goto failed1;
7852 Py_DECREF(key);
7853 Py_DECREF(value);
7854 }
7855 return result;
7856 failed1:
7857 Py_XDECREF(key);
7858 Py_XDECREF(value);
7859 Py_DECREF(result);
7860 return NULL;
7861 }
7862
7863 /* Create a three-level trie */
7864 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7865 16*count2 + 128*count3 - 1);
7866 if (!result)
7867 return PyErr_NoMemory();
7868 PyObject_Init(result, &EncodingMapType);
7869 mresult = (struct encoding_map*)result;
7870 mresult->count2 = count2;
7871 mresult->count3 = count3;
7872 mlevel1 = mresult->level1;
7873 mlevel2 = mresult->level23;
7874 mlevel3 = mresult->level23 + 16*count2;
7875 memcpy(mlevel1, level1, 32);
7876 memset(mlevel2, 0xFF, 16*count2);
7877 memset(mlevel3, 0, 128*count3);
7878 count3 = 0;
7879 for (i = 1; i < 256; i++) {
7880 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007881 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007882 /* unmapped character */
7883 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007884 o1 = PyUnicode_READ(kind, data, i)>>11;
7885 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007886 i2 = 16*mlevel1[o1] + o2;
7887 if (mlevel2[i2] == 0xFF)
7888 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007889 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007890 i3 = 128*mlevel2[i2] + o3;
7891 mlevel3[i3] = i;
7892 }
7893 return result;
7894}
7895
7896static int
Victor Stinner22168992011-11-20 17:09:18 +01007897encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007898{
7899 struct encoding_map *map = (struct encoding_map*)mapping;
7900 int l1 = c>>11;
7901 int l2 = (c>>7) & 0xF;
7902 int l3 = c & 0x7F;
7903 int i;
7904
Victor Stinner22168992011-11-20 17:09:18 +01007905 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007907 if (c == 0)
7908 return 0;
7909 /* level 1*/
7910 i = map->level1[l1];
7911 if (i == 0xFF) {
7912 return -1;
7913 }
7914 /* level 2*/
7915 i = map->level23[16*i+l2];
7916 if (i == 0xFF) {
7917 return -1;
7918 }
7919 /* level 3 */
7920 i = map->level23[16*map->count2 + 128*i + l3];
7921 if (i == 0) {
7922 return -1;
7923 }
7924 return i;
7925}
7926
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007927/* Lookup the character ch in the mapping. If the character
7928 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007929 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007930static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007931charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007932{
Christian Heimes217cfd12007-12-02 14:31:20 +00007933 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007934 PyObject *x;
7935
7936 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007937 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007938 x = PyObject_GetItem(mapping, w);
7939 Py_DECREF(w);
7940 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007941 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7942 /* No mapping found means: mapping is undefined. */
7943 PyErr_Clear();
7944 x = Py_None;
7945 Py_INCREF(x);
7946 return x;
7947 } else
7948 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007950 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007951 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007952 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007953 long value = PyLong_AS_LONG(x);
7954 if (value < 0 || value > 255) {
7955 PyErr_SetString(PyExc_TypeError,
7956 "character mapping must be in range(256)");
7957 Py_DECREF(x);
7958 return NULL;
7959 }
7960 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007962 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007963 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 /* wrong return value */
7966 PyErr_Format(PyExc_TypeError,
7967 "character mapping must return integer, bytes or None, not %.400s",
7968 x->ob_type->tp_name);
7969 Py_DECREF(x);
7970 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971 }
7972}
7973
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007974static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007975charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007976{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007977 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7978 /* exponentially overallocate to minimize reallocations */
7979 if (requiredsize < 2*outsize)
7980 requiredsize = 2*outsize;
7981 if (_PyBytes_Resize(outobj, requiredsize))
7982 return -1;
7983 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007984}
7985
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007989/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007990 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007991 space is available. Return a new reference to the object that
7992 was put in the output buffer, or Py_None, if the mapping was undefined
7993 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007994 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007995static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007996charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007997 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007998{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007999 PyObject *rep;
8000 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008001 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008002
Christian Heimes90aa7642007-12-19 02:45:37 +00008003 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008004 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008005 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008006 if (res == -1)
8007 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008008 if (outsize<requiredsize)
8009 if (charmapencode_resize(outobj, outpos, requiredsize))
8010 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008011 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008012 outstart[(*outpos)++] = (char)res;
8013 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008014 }
8015
8016 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008017 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008018 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008019 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008020 Py_DECREF(rep);
8021 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008022 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008023 if (PyLong_Check(rep)) {
8024 Py_ssize_t requiredsize = *outpos+1;
8025 if (outsize<requiredsize)
8026 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8027 Py_DECREF(rep);
8028 return enc_EXCEPTION;
8029 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008030 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008031 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008032 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008033 else {
8034 const char *repchars = PyBytes_AS_STRING(rep);
8035 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8036 Py_ssize_t requiredsize = *outpos+repsize;
8037 if (outsize<requiredsize)
8038 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8039 Py_DECREF(rep);
8040 return enc_EXCEPTION;
8041 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008042 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 memcpy(outstart + *outpos, repchars, repsize);
8044 *outpos += repsize;
8045 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008046 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008047 Py_DECREF(rep);
8048 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008049}
8050
8051/* handle an error in PyUnicode_EncodeCharmap
8052 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008053static int
8054charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008055 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008056 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008057 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008058 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008059{
8060 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008061 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008062 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008063 enum PyUnicode_Kind kind;
8064 void *data;
8065 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008066 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008067 Py_ssize_t collstartpos = *inpos;
8068 Py_ssize_t collendpos = *inpos+1;
8069 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008070 char *encoding = "charmap";
8071 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008072 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008073 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008074 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008075
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008076 if (PyUnicode_READY(unicode) < 0)
8077 return -1;
8078 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008079 /* find all unencodable characters */
8080 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008081 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008082 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008083 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008084 val = encoding_map_lookup(ch, mapping);
8085 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 break;
8087 ++collendpos;
8088 continue;
8089 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008090
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008091 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8092 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 if (rep==NULL)
8094 return -1;
8095 else if (rep!=Py_None) {
8096 Py_DECREF(rep);
8097 break;
8098 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008099 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008100 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008101 }
8102 /* cache callback name lookup
8103 * (if not done yet, i.e. it's the first error) */
8104 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008105 if ((errors==NULL) || (!strcmp(errors, "strict")))
8106 *known_errorHandler = 1;
8107 else if (!strcmp(errors, "replace"))
8108 *known_errorHandler = 2;
8109 else if (!strcmp(errors, "ignore"))
8110 *known_errorHandler = 3;
8111 else if (!strcmp(errors, "xmlcharrefreplace"))
8112 *known_errorHandler = 4;
8113 else
8114 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008115 }
8116 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008117 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008118 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008119 return -1;
8120 case 2: /* replace */
8121 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008122 x = charmapencode_output('?', mapping, res, respos);
8123 if (x==enc_EXCEPTION) {
8124 return -1;
8125 }
8126 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008127 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008128 return -1;
8129 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008130 }
8131 /* fall through */
8132 case 3: /* ignore */
8133 *inpos = collendpos;
8134 break;
8135 case 4: /* xmlcharrefreplace */
8136 /* generate replacement (temporarily (mis)uses p) */
8137 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008138 char buffer[2+29+1+1];
8139 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008140 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008141 for (cp = buffer; *cp; ++cp) {
8142 x = charmapencode_output(*cp, mapping, res, respos);
8143 if (x==enc_EXCEPTION)
8144 return -1;
8145 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008146 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 return -1;
8148 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008149 }
8150 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008151 *inpos = collendpos;
8152 break;
8153 default:
8154 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008155 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008156 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008157 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008158 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008159 if (PyBytes_Check(repunicode)) {
8160 /* Directly copy bytes result to output. */
8161 Py_ssize_t outsize = PyBytes_Size(*res);
8162 Py_ssize_t requiredsize;
8163 repsize = PyBytes_Size(repunicode);
8164 requiredsize = *respos + repsize;
8165 if (requiredsize > outsize)
8166 /* Make room for all additional bytes. */
8167 if (charmapencode_resize(res, respos, requiredsize)) {
8168 Py_DECREF(repunicode);
8169 return -1;
8170 }
8171 memcpy(PyBytes_AsString(*res) + *respos,
8172 PyBytes_AsString(repunicode), repsize);
8173 *respos += repsize;
8174 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008175 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008176 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008177 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008178 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008179 if (PyUnicode_READY(repunicode) < 0) {
8180 Py_DECREF(repunicode);
8181 return -1;
8182 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008183 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008184 data = PyUnicode_DATA(repunicode);
8185 kind = PyUnicode_KIND(repunicode);
8186 for (index = 0; index < repsize; index++) {
8187 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8188 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008190 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 return -1;
8192 }
8193 else if (x==enc_FAILED) {
8194 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008195 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008196 return -1;
8197 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008198 }
8199 *inpos = newpos;
8200 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008201 }
8202 return 0;
8203}
8204
Alexander Belopolsky40018472011-02-26 01:02:56 +00008205PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008206_PyUnicode_EncodeCharmap(PyObject *unicode,
8207 PyObject *mapping,
8208 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008210 /* output object */
8211 PyObject *res = NULL;
8212 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008213 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008214 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008215 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008216 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008217 PyObject *errorHandler = NULL;
8218 PyObject *exc = NULL;
8219 /* the following variable is used for caching string comparisons
8220 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8221 * 3=ignore, 4=xmlcharrefreplace */
8222 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008223
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008224 if (PyUnicode_READY(unicode) < 0)
8225 return NULL;
8226 size = PyUnicode_GET_LENGTH(unicode);
8227
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228 /* Default to Latin-1 */
8229 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008230 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008231
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008232 /* allocate enough for a simple encoding without
8233 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008234 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008235 if (res == NULL)
8236 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008237 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008241 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008243 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 if (x==enc_EXCEPTION) /* error */
8245 goto onError;
8246 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008247 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 &exc,
8249 &known_errorHandler, &errorHandler, errors,
8250 &res, &respos)) {
8251 goto onError;
8252 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008253 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 else
8255 /* done with this character => adjust input position */
8256 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008259 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008260 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008261 if (_PyBytes_Resize(&res, respos) < 0)
8262 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008263
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264 Py_XDECREF(exc);
8265 Py_XDECREF(errorHandler);
8266 return res;
8267
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008269 Py_XDECREF(res);
8270 Py_XDECREF(exc);
8271 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272 return NULL;
8273}
8274
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008275/* Deprecated */
8276PyObject *
8277PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8278 Py_ssize_t size,
8279 PyObject *mapping,
8280 const char *errors)
8281{
8282 PyObject *result;
8283 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8284 if (unicode == NULL)
8285 return NULL;
8286 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8287 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008288 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008289}
8290
Alexander Belopolsky40018472011-02-26 01:02:56 +00008291PyObject *
8292PyUnicode_AsCharmapString(PyObject *unicode,
8293 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294{
8295 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 PyErr_BadArgument();
8297 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008299 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300}
8301
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008302/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008303static void
8304make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008305 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008306 Py_ssize_t startpos, Py_ssize_t endpos,
8307 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008309 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008310 *exceptionObject = _PyUnicodeTranslateError_Create(
8311 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312 }
8313 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8315 goto onError;
8316 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8317 goto onError;
8318 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8319 goto onError;
8320 return;
8321 onError:
8322 Py_DECREF(*exceptionObject);
8323 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324 }
8325}
8326
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008327/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008328static void
8329raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008330 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008331 Py_ssize_t startpos, Py_ssize_t endpos,
8332 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008333{
8334 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008335 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008336 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008338}
8339
8340/* error handling callback helper:
8341 build arguments, call the callback and check the arguments,
8342 put the result into newpos and return the replacement string, which
8343 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008344static PyObject *
8345unicode_translate_call_errorhandler(const char *errors,
8346 PyObject **errorHandler,
8347 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008348 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008349 Py_ssize_t startpos, Py_ssize_t endpos,
8350 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008351{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008352 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008353
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008354 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008355 PyObject *restuple;
8356 PyObject *resunicode;
8357
8358 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008360 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008362 }
8363
8364 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008365 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008366 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008368
8369 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008370 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008371 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008372 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008373 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008374 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 Py_DECREF(restuple);
8376 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377 }
8378 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 &resunicode, &i_newpos)) {
8380 Py_DECREF(restuple);
8381 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008382 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008383 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008384 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008385 else
8386 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008387 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8389 Py_DECREF(restuple);
8390 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008391 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 Py_INCREF(resunicode);
8393 Py_DECREF(restuple);
8394 return resunicode;
8395}
8396
8397/* Lookup the character ch in the mapping and put the result in result,
8398 which must be decrefed by the caller.
8399 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008400static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008401charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008402{
Christian Heimes217cfd12007-12-02 14:31:20 +00008403 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404 PyObject *x;
8405
8406 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408 x = PyObject_GetItem(mapping, w);
8409 Py_DECREF(w);
8410 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8412 /* No mapping found means: use 1:1 mapping. */
8413 PyErr_Clear();
8414 *result = NULL;
8415 return 0;
8416 } else
8417 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418 }
8419 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 *result = x;
8421 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008423 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 long value = PyLong_AS_LONG(x);
8425 long max = PyUnicode_GetMax();
8426 if (value < 0 || value > max) {
8427 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008428 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 Py_DECREF(x);
8430 return -1;
8431 }
8432 *result = x;
8433 return 0;
8434 }
8435 else if (PyUnicode_Check(x)) {
8436 *result = x;
8437 return 0;
8438 }
8439 else {
8440 /* wrong return value */
8441 PyErr_SetString(PyExc_TypeError,
8442 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008443 Py_DECREF(x);
8444 return -1;
8445 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008446}
8447/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 if not reallocate and adjust various state variables.
8449 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008450static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008451charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008453{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008455 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 /* exponentially overallocate to minimize reallocations */
8457 if (requiredsize < 2 * oldsize)
8458 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008459 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8460 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008462 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008463 }
8464 return 0;
8465}
8466/* lookup the character, put the result in the output string and adjust
8467 various state variables. Return a new reference to the object that
8468 was put in the output buffer in *result, or Py_None, if the mapping was
8469 undefined (in which case no character was written).
8470 The called must decref result.
8471 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008472static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008473charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8474 PyObject *mapping, Py_UCS4 **output,
8475 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008476 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008477{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008478 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8479 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008481 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008483 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008484 }
8485 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008487 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008489 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008490 }
8491 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008492 Py_ssize_t repsize;
8493 if (PyUnicode_READY(*res) == -1)
8494 return -1;
8495 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 if (repsize==1) {
8497 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008498 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 }
8500 else if (repsize!=0) {
8501 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502 Py_ssize_t requiredsize = *opos +
8503 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008505 Py_ssize_t i;
8506 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008507 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008508 for(i = 0; i < repsize; i++)
8509 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008511 }
8512 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008514 return 0;
8515}
8516
Alexander Belopolsky40018472011-02-26 01:02:56 +00008517PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008518_PyUnicode_TranslateCharmap(PyObject *input,
8519 PyObject *mapping,
8520 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008521{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008522 /* input object */
8523 char *idata;
8524 Py_ssize_t size, i;
8525 int kind;
8526 /* output buffer */
8527 Py_UCS4 *output = NULL;
8528 Py_ssize_t osize;
8529 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008532 char *reason = "character maps to <undefined>";
8533 PyObject *errorHandler = NULL;
8534 PyObject *exc = NULL;
8535 /* the following variable is used for caching string comparisons
8536 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8537 * 3=ignore, 4=xmlcharrefreplace */
8538 int known_errorHandler = -1;
8539
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 PyErr_BadArgument();
8542 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008545 if (PyUnicode_READY(input) == -1)
8546 return NULL;
8547 idata = (char*)PyUnicode_DATA(input);
8548 kind = PyUnicode_KIND(input);
8549 size = PyUnicode_GET_LENGTH(input);
8550 i = 0;
8551
8552 if (size == 0) {
8553 Py_INCREF(input);
8554 return input;
8555 }
8556
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008557 /* allocate enough for a simple 1:1 translation without
8558 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008559 osize = size;
8560 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8561 opos = 0;
8562 if (output == NULL) {
8563 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008565 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008567 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 /* try to encode it */
8569 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 if (charmaptranslate_output(input, i, mapping,
8571 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 Py_XDECREF(x);
8573 goto onError;
8574 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008575 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008577 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 else { /* untranslatable character */
8579 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8580 Py_ssize_t repsize;
8581 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008582 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008584 Py_ssize_t collstart = i;
8585 Py_ssize_t collend = i+1;
8586 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 while (collend < size) {
8590 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 goto onError;
8592 Py_XDECREF(x);
8593 if (x!=Py_None)
8594 break;
8595 ++collend;
8596 }
8597 /* cache callback name lookup
8598 * (if not done yet, i.e. it's the first error) */
8599 if (known_errorHandler==-1) {
8600 if ((errors==NULL) || (!strcmp(errors, "strict")))
8601 known_errorHandler = 1;
8602 else if (!strcmp(errors, "replace"))
8603 known_errorHandler = 2;
8604 else if (!strcmp(errors, "ignore"))
8605 known_errorHandler = 3;
8606 else if (!strcmp(errors, "xmlcharrefreplace"))
8607 known_errorHandler = 4;
8608 else
8609 known_errorHandler = 0;
8610 }
8611 switch (known_errorHandler) {
8612 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613 raise_translate_exception(&exc, input, collstart,
8614 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008615 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 case 2: /* replace */
8617 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008618 for (coll = collstart; coll<collend; coll++)
8619 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 /* fall through */
8621 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008622 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008623 break;
8624 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008625 /* generate replacement (temporarily (mis)uses i) */
8626 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 char buffer[2+29+1+1];
8628 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008629 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8630 if (charmaptranslate_makespace(&output, &osize,
8631 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 goto onError;
8633 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008634 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 break;
8638 default:
8639 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008640 reason, input, &exc,
8641 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008642 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008644 if (PyUnicode_READY(repunicode) < 0) {
8645 Py_DECREF(repunicode);
8646 goto onError;
8647 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649 repsize = PyUnicode_GET_LENGTH(repunicode);
8650 if (charmaptranslate_makespace(&output, &osize,
8651 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008652 Py_DECREF(repunicode);
8653 goto onError;
8654 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008655 for (uni2 = 0; repsize-->0; ++uni2)
8656 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8657 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008658 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008659 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008660 }
8661 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8663 if (!res)
8664 goto onError;
8665 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666 Py_XDECREF(exc);
8667 Py_XDECREF(errorHandler);
8668 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669
Benjamin Peterson29060642009-01-31 22:14:21 +00008670 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008672 Py_XDECREF(exc);
8673 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674 return NULL;
8675}
8676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677/* Deprecated. Use PyUnicode_Translate instead. */
8678PyObject *
8679PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8680 Py_ssize_t size,
8681 PyObject *mapping,
8682 const char *errors)
8683{
8684 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8685 if (!unicode)
8686 return NULL;
8687 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8688}
8689
Alexander Belopolsky40018472011-02-26 01:02:56 +00008690PyObject *
8691PyUnicode_Translate(PyObject *str,
8692 PyObject *mapping,
8693 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008694{
8695 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008696
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697 str = PyUnicode_FromObject(str);
8698 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008700 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701 Py_DECREF(str);
8702 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008703
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008705 Py_XDECREF(str);
8706 return NULL;
8707}
Tim Petersced69f82003-09-16 20:30:58 +00008708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008709static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008710fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008711{
8712 /* No need to call PyUnicode_READY(self) because this function is only
8713 called as a callback from fixup() which does it already. */
8714 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8715 const int kind = PyUnicode_KIND(self);
8716 void *data = PyUnicode_DATA(self);
8717 Py_UCS4 maxchar = 0, ch, fixed;
8718 Py_ssize_t i;
8719
8720 for (i = 0; i < len; ++i) {
8721 ch = PyUnicode_READ(kind, data, i);
8722 fixed = 0;
8723 if (ch > 127) {
8724 if (Py_UNICODE_ISSPACE(ch))
8725 fixed = ' ';
8726 else {
8727 const int decimal = Py_UNICODE_TODECIMAL(ch);
8728 if (decimal >= 0)
8729 fixed = '0' + decimal;
8730 }
8731 if (fixed != 0) {
8732 if (fixed > maxchar)
8733 maxchar = fixed;
8734 PyUnicode_WRITE(kind, data, i, fixed);
8735 }
8736 else if (ch > maxchar)
8737 maxchar = ch;
8738 }
8739 else if (ch > maxchar)
8740 maxchar = ch;
8741 }
8742
8743 return maxchar;
8744}
8745
8746PyObject *
8747_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8748{
8749 if (!PyUnicode_Check(unicode)) {
8750 PyErr_BadInternalCall();
8751 return NULL;
8752 }
8753 if (PyUnicode_READY(unicode) == -1)
8754 return NULL;
8755 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8756 /* If the string is already ASCII, just return the same string */
8757 Py_INCREF(unicode);
8758 return unicode;
8759 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008760 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008761}
8762
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008763PyObject *
8764PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8765 Py_ssize_t length)
8766{
Victor Stinnerf0124502011-11-21 23:12:56 +01008767 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008768 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008769 Py_UCS4 maxchar;
8770 enum PyUnicode_Kind kind;
8771 void *data;
8772
8773 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008774 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008775 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008776 if (ch > 127) {
8777 int decimal = Py_UNICODE_TODECIMAL(ch);
8778 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008779 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008780 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008781 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008782 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008783
8784 /* Copy to a new string */
8785 decimal = PyUnicode_New(length, maxchar);
8786 if (decimal == NULL)
8787 return decimal;
8788 kind = PyUnicode_KIND(decimal);
8789 data = PyUnicode_DATA(decimal);
8790 /* Iterate over code points */
8791 for (i = 0; i < length; i++) {
8792 Py_UNICODE ch = s[i];
8793 if (ch > 127) {
8794 int decimal = Py_UNICODE_TODECIMAL(ch);
8795 if (decimal >= 0)
8796 ch = '0' + decimal;
8797 }
8798 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008799 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008800 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008801}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008802/* --- Decimal Encoder ---------------------------------------------------- */
8803
Alexander Belopolsky40018472011-02-26 01:02:56 +00008804int
8805PyUnicode_EncodeDecimal(Py_UNICODE *s,
8806 Py_ssize_t length,
8807 char *output,
8808 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008809{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008810 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008811 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008812 enum PyUnicode_Kind kind;
8813 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008814
8815 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008816 PyErr_BadArgument();
8817 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008818 }
8819
Victor Stinner42bf7752011-11-21 22:52:58 +01008820 unicode = PyUnicode_FromUnicode(s, length);
8821 if (unicode == NULL)
8822 return -1;
8823
Victor Stinner6345be92011-11-25 20:09:01 +01008824 if (PyUnicode_READY(unicode) < 0) {
8825 Py_DECREF(unicode);
8826 return -1;
8827 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008828 kind = PyUnicode_KIND(unicode);
8829 data = PyUnicode_DATA(unicode);
8830
Victor Stinnerb84d7232011-11-22 01:50:07 +01008831 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008832 PyObject *exc;
8833 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008835 Py_ssize_t startpos;
8836
8837 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008838
Benjamin Peterson29060642009-01-31 22:14:21 +00008839 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008840 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008841 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008842 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008843 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008844 decimal = Py_UNICODE_TODECIMAL(ch);
8845 if (decimal >= 0) {
8846 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008847 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008848 continue;
8849 }
8850 if (0 < ch && ch < 256) {
8851 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008852 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008853 continue;
8854 }
Victor Stinner6345be92011-11-25 20:09:01 +01008855
Victor Stinner42bf7752011-11-21 22:52:58 +01008856 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008857 exc = NULL;
8858 raise_encode_exception(&exc, "decimal", unicode,
8859 startpos, startpos+1,
8860 "invalid decimal Unicode string");
8861 Py_XDECREF(exc);
8862 Py_DECREF(unicode);
8863 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008864 }
8865 /* 0-terminate the output string */
8866 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008867 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008868 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008869}
8870
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871/* --- Helpers ------------------------------------------------------------ */
8872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008873static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008874any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 Py_ssize_t start,
8876 Py_ssize_t end)
8877{
8878 int kind1, kind2, kind;
8879 void *buf1, *buf2;
8880 Py_ssize_t len1, len2, result;
8881
8882 kind1 = PyUnicode_KIND(s1);
8883 kind2 = PyUnicode_KIND(s2);
8884 kind = kind1 > kind2 ? kind1 : kind2;
8885 buf1 = PyUnicode_DATA(s1);
8886 buf2 = PyUnicode_DATA(s2);
8887 if (kind1 != kind)
8888 buf1 = _PyUnicode_AsKind(s1, kind);
8889 if (!buf1)
8890 return -2;
8891 if (kind2 != kind)
8892 buf2 = _PyUnicode_AsKind(s2, kind);
8893 if (!buf2) {
8894 if (kind1 != kind) PyMem_Free(buf1);
8895 return -2;
8896 }
8897 len1 = PyUnicode_GET_LENGTH(s1);
8898 len2 = PyUnicode_GET_LENGTH(s2);
8899
Victor Stinner794d5672011-10-10 03:21:36 +02008900 if (direction > 0) {
8901 switch(kind) {
8902 case PyUnicode_1BYTE_KIND:
8903 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8904 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8905 else
8906 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8907 break;
8908 case PyUnicode_2BYTE_KIND:
8909 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8910 break;
8911 case PyUnicode_4BYTE_KIND:
8912 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8913 break;
8914 default:
8915 assert(0); result = -2;
8916 }
8917 }
8918 else {
8919 switch(kind) {
8920 case PyUnicode_1BYTE_KIND:
8921 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8922 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8923 else
8924 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8925 break;
8926 case PyUnicode_2BYTE_KIND:
8927 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8928 break;
8929 case PyUnicode_4BYTE_KIND:
8930 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8931 break;
8932 default:
8933 assert(0); result = -2;
8934 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008935 }
8936
8937 if (kind1 != kind)
8938 PyMem_Free(buf1);
8939 if (kind2 != kind)
8940 PyMem_Free(buf2);
8941
8942 return result;
8943}
8944
8945Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008946_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947 Py_ssize_t n_buffer,
8948 void *digits, Py_ssize_t n_digits,
8949 Py_ssize_t min_width,
8950 const char *grouping,
8951 const char *thousands_sep)
8952{
8953 switch(kind) {
8954 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008955 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8956 return _PyUnicode_ascii_InsertThousandsGrouping(
8957 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8958 min_width, grouping, thousands_sep);
8959 else
8960 return _PyUnicode_ucs1_InsertThousandsGrouping(
8961 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8962 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008963 case PyUnicode_2BYTE_KIND:
8964 return _PyUnicode_ucs2_InsertThousandsGrouping(
8965 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8966 min_width, grouping, thousands_sep);
8967 case PyUnicode_4BYTE_KIND:
8968 return _PyUnicode_ucs4_InsertThousandsGrouping(
8969 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8970 min_width, grouping, thousands_sep);
8971 }
8972 assert(0);
8973 return -1;
8974}
8975
8976
/* Helper macro to fix up start/end slice values in place, following the
   usual slice conventions: `end` is clipped to `len`; a negative `start`
   or `end` is taken relative to `len` and clipped at 0.  `start` is not
   clipped against `len` or `end`.

   `start` and `end` must be modifiable lvalues; `len` may be evaluated
   more than once.  The body is wrapped in do { } while (0) so that the
   macro expands to exactly one statement and must be followed by a
   semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008991
Alexander Belopolsky40018472011-02-26 01:02:56 +00008992Py_ssize_t
8993PyUnicode_Count(PyObject *str,
8994 PyObject *substr,
8995 Py_ssize_t start,
8996 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008998 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008999 PyObject* str_obj;
9000 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001 int kind1, kind2, kind;
9002 void *buf1 = NULL, *buf2 = NULL;
9003 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009004
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009005 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009006 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009007 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009008 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009009 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009010 Py_DECREF(str_obj);
9011 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012 }
Tim Petersced69f82003-09-16 20:30:58 +00009013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 kind1 = PyUnicode_KIND(str_obj);
9015 kind2 = PyUnicode_KIND(sub_obj);
9016 kind = kind1 > kind2 ? kind1 : kind2;
9017 buf1 = PyUnicode_DATA(str_obj);
9018 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009019 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009020 if (!buf1)
9021 goto onError;
9022 buf2 = PyUnicode_DATA(sub_obj);
9023 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009024 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009025 if (!buf2)
9026 goto onError;
9027 len1 = PyUnicode_GET_LENGTH(str_obj);
9028 len2 = PyUnicode_GET_LENGTH(sub_obj);
9029
9030 ADJUST_INDICES(start, end, len1);
9031 switch(kind) {
9032 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009033 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9034 result = asciilib_count(
9035 ((Py_UCS1*)buf1) + start, end - start,
9036 buf2, len2, PY_SSIZE_T_MAX
9037 );
9038 else
9039 result = ucs1lib_count(
9040 ((Py_UCS1*)buf1) + start, end - start,
9041 buf2, len2, PY_SSIZE_T_MAX
9042 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009043 break;
9044 case PyUnicode_2BYTE_KIND:
9045 result = ucs2lib_count(
9046 ((Py_UCS2*)buf1) + start, end - start,
9047 buf2, len2, PY_SSIZE_T_MAX
9048 );
9049 break;
9050 case PyUnicode_4BYTE_KIND:
9051 result = ucs4lib_count(
9052 ((Py_UCS4*)buf1) + start, end - start,
9053 buf2, len2, PY_SSIZE_T_MAX
9054 );
9055 break;
9056 default:
9057 assert(0); result = 0;
9058 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009059
9060 Py_DECREF(sub_obj);
9061 Py_DECREF(str_obj);
9062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009063 if (kind1 != kind)
9064 PyMem_Free(buf1);
9065 if (kind2 != kind)
9066 PyMem_Free(buf2);
9067
Guido van Rossumd57fd912000-03-10 22:53:23 +00009068 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069 onError:
9070 Py_DECREF(sub_obj);
9071 Py_DECREF(str_obj);
9072 if (kind1 != kind && buf1)
9073 PyMem_Free(buf1);
9074 if (kind2 != kind && buf2)
9075 PyMem_Free(buf2);
9076 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009077}
9078
Alexander Belopolsky40018472011-02-26 01:02:56 +00009079Py_ssize_t
9080PyUnicode_Find(PyObject *str,
9081 PyObject *sub,
9082 Py_ssize_t start,
9083 Py_ssize_t end,
9084 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009086 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009087
Guido van Rossumd57fd912000-03-10 22:53:23 +00009088 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009089 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009090 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009091 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009092 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009093 Py_DECREF(str);
9094 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009095 }
Tim Petersced69f82003-09-16 20:30:58 +00009096
Victor Stinner794d5672011-10-10 03:21:36 +02009097 result = any_find_slice(direction,
9098 str, sub, start, end
9099 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009100
Guido van Rossumd57fd912000-03-10 22:53:23 +00009101 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009102 Py_DECREF(sub);
9103
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104 return result;
9105}
9106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009107Py_ssize_t
9108PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9109 Py_ssize_t start, Py_ssize_t end,
9110 int direction)
9111{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009113 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009114 if (PyUnicode_READY(str) == -1)
9115 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009116 if (start < 0 || end < 0) {
9117 PyErr_SetString(PyExc_IndexError, "string index out of range");
9118 return -2;
9119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009120 if (end > PyUnicode_GET_LENGTH(str))
9121 end = PyUnicode_GET_LENGTH(str);
9122 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009123 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9124 kind, end-start, ch, direction);
9125 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009127 else
9128 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009129}
9130
Alexander Belopolsky40018472011-02-26 01:02:56 +00009131static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009132tailmatch(PyObject *self,
9133 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009134 Py_ssize_t start,
9135 Py_ssize_t end,
9136 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009138 int kind_self;
9139 int kind_sub;
9140 void *data_self;
9141 void *data_sub;
9142 Py_ssize_t offset;
9143 Py_ssize_t i;
9144 Py_ssize_t end_sub;
9145
9146 if (PyUnicode_READY(self) == -1 ||
9147 PyUnicode_READY(substring) == -1)
9148 return 0;
9149
9150 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009151 return 1;
9152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009153 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9154 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009155 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009156 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009158 kind_self = PyUnicode_KIND(self);
9159 data_self = PyUnicode_DATA(self);
9160 kind_sub = PyUnicode_KIND(substring);
9161 data_sub = PyUnicode_DATA(substring);
9162 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9163
9164 if (direction > 0)
9165 offset = end;
9166 else
9167 offset = start;
9168
9169 if (PyUnicode_READ(kind_self, data_self, offset) ==
9170 PyUnicode_READ(kind_sub, data_sub, 0) &&
9171 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9172 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9173 /* If both are of the same kind, memcmp is sufficient */
9174 if (kind_self == kind_sub) {
9175 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009176 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009177 data_sub,
9178 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009179 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009180 }
9181 /* otherwise we have to compare each character by first accesing it */
9182 else {
9183 /* We do not need to compare 0 and len(substring)-1 because
9184 the if statement above ensured already that they are equal
9185 when we end up here. */
9186 // TODO: honor direction and do a forward or backwards search
9187 for (i = 1; i < end_sub; ++i) {
9188 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9189 PyUnicode_READ(kind_sub, data_sub, i))
9190 return 0;
9191 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009192 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009193 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009194 }
9195
9196 return 0;
9197}
9198
Alexander Belopolsky40018472011-02-26 01:02:56 +00009199Py_ssize_t
9200PyUnicode_Tailmatch(PyObject *str,
9201 PyObject *substr,
9202 Py_ssize_t start,
9203 Py_ssize_t end,
9204 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009205{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009206 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009207
Guido van Rossumd57fd912000-03-10 22:53:23 +00009208 str = PyUnicode_FromObject(str);
9209 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009210 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009211 substr = PyUnicode_FromObject(substr);
9212 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009213 Py_DECREF(str);
9214 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009215 }
Tim Petersced69f82003-09-16 20:30:58 +00009216
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009217 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009218 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009219 Py_DECREF(str);
9220 Py_DECREF(substr);
9221 return result;
9222}
9223
Guido van Rossumd57fd912000-03-10 22:53:23 +00009224/* Apply fixfct filter to the Unicode object self and return a
9225 reference to the modified object */
9226
Alexander Belopolsky40018472011-02-26 01:02:56 +00009227static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009228fixup(PyObject *self,
9229 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009230{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009231 PyObject *u;
9232 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009233
Victor Stinner87af4f22011-11-21 23:03:47 +01009234 u = PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009235 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009236 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009237 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239 /* fix functions return the new maximum character in a string,
9240 if the kind of the resulting unicode object does not change,
9241 everything is fine. Otherwise we need to change the string kind
9242 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009243 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244 if (maxchar_new == 0)
9245 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9246 else if (maxchar_new <= 127)
9247 maxchar_new = 127;
9248 else if (maxchar_new <= 255)
9249 maxchar_new = 255;
9250 else if (maxchar_new <= 65535)
9251 maxchar_new = 65535;
9252 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009253 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254
9255 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009256 /* fixfct should return TRUE if it modified the buffer. If
9257 FALSE, return a reference to the original buffer instead
9258 (to save space, not time) */
9259 Py_INCREF(self);
9260 Py_DECREF(u);
Victor Stinner7931d9a2011-11-04 00:22:48 +01009261 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009262 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263 else if (maxchar_new == maxchar_old) {
9264 return u;
9265 }
9266 else {
9267 /* In case the maximum character changed, we need to
9268 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009269 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009270 if (v == NULL) {
9271 Py_DECREF(u);
9272 return NULL;
9273 }
9274 if (maxchar_new > maxchar_old) {
9275 /* If the maxchar increased so that the kind changed, not all
9276 characters are representable anymore and we need to fix the
9277 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009278 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009279 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9281 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009282 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009283 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009284 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285
9286 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009287 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009288 return v;
9289 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290}
9291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009292static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009293fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009294{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 /* No need to call PyUnicode_READY(self) because this function is only
9296 called as a callback from fixup() which does it already. */
9297 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9298 const int kind = PyUnicode_KIND(self);
9299 void *data = PyUnicode_DATA(self);
9300 int touched = 0;
9301 Py_UCS4 maxchar = 0;
9302 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009304 for (i = 0; i < len; ++i) {
9305 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9306 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9307 if (up != ch) {
9308 if (up > maxchar)
9309 maxchar = up;
9310 PyUnicode_WRITE(kind, data, i, up);
9311 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 else if (ch > maxchar)
9314 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009315 }
9316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317 if (touched)
9318 return maxchar;
9319 else
9320 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009321}
9322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009323static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009324fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009325{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009326 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9327 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9328 const int kind = PyUnicode_KIND(self);
9329 void *data = PyUnicode_DATA(self);
9330 int touched = 0;
9331 Py_UCS4 maxchar = 0;
9332 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009334 for(i = 0; i < len; ++i) {
9335 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9336 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9337 if (lo != ch) {
9338 if (lo > maxchar)
9339 maxchar = lo;
9340 PyUnicode_WRITE(kind, data, i, lo);
9341 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009342 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009343 else if (ch > maxchar)
9344 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009345 }
9346
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 if (touched)
9348 return maxchar;
9349 else
9350 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009351}
9352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009353static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009354fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009355{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009356 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9357 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9358 const int kind = PyUnicode_KIND(self);
9359 void *data = PyUnicode_DATA(self);
9360 int touched = 0;
9361 Py_UCS4 maxchar = 0;
9362 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009363
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009364 for(i = 0; i < len; ++i) {
9365 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9366 Py_UCS4 nu = 0;
9367
9368 if (Py_UNICODE_ISUPPER(ch))
9369 nu = Py_UNICODE_TOLOWER(ch);
9370 else if (Py_UNICODE_ISLOWER(ch))
9371 nu = Py_UNICODE_TOUPPER(ch);
9372
9373 if (nu != 0) {
9374 if (nu > maxchar)
9375 maxchar = nu;
9376 PyUnicode_WRITE(kind, data, i, nu);
9377 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379 else if (ch > maxchar)
9380 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009381 }
9382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009383 if (touched)
9384 return maxchar;
9385 else
9386 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009387}
9388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009390fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009391{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009392 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9393 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9394 const int kind = PyUnicode_KIND(self);
9395 void *data = PyUnicode_DATA(self);
9396 int touched = 0;
9397 Py_UCS4 maxchar = 0;
9398 Py_ssize_t i = 0;
9399 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009400
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009401 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009402 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009403
9404 ch = PyUnicode_READ(kind, data, i);
9405 if (!Py_UNICODE_ISUPPER(ch)) {
9406 maxchar = Py_UNICODE_TOUPPER(ch);
9407 PyUnicode_WRITE(kind, data, i, maxchar);
9408 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009409 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410 ++i;
9411 for(; i < len; ++i) {
9412 ch = PyUnicode_READ(kind, data, i);
9413 if (!Py_UNICODE_ISLOWER(ch)) {
9414 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9415 if (lo > maxchar)
9416 maxchar = lo;
9417 PyUnicode_WRITE(kind, data, i, lo);
9418 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 else if (ch > maxchar)
9421 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009422 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423
9424 if (touched)
9425 return maxchar;
9426 else
9427 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009428}
9429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009431fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009432{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009433 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9434 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9435 const int kind = PyUnicode_KIND(self);
9436 void *data = PyUnicode_DATA(self);
9437 Py_UCS4 maxchar = 0;
9438 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009439 int previous_is_cased;
9440
9441 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009442 if (len == 1) {
9443 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9444 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9445 if (ti != ch) {
9446 PyUnicode_WRITE(kind, data, i, ti);
9447 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009448 }
9449 else
9450 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009451 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009452 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453 for(; i < len; ++i) {
9454 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9455 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009456
Benjamin Peterson29060642009-01-31 22:14:21 +00009457 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009459 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460 nu = Py_UNICODE_TOTITLE(ch);
9461
9462 if (nu > maxchar)
9463 maxchar = nu;
9464 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009465
Benjamin Peterson29060642009-01-31 22:14:21 +00009466 if (Py_UNICODE_ISLOWER(ch) ||
9467 Py_UNICODE_ISUPPER(ch) ||
9468 Py_UNICODE_ISTITLE(ch))
9469 previous_is_cased = 1;
9470 else
9471 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474}
9475
Tim Peters8ce9f162004-08-27 01:49:32 +00009476PyObject *
9477PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009480 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009482 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009483 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9484 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009485 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009487 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009489 int use_memcpy;
9490 unsigned char *res_data = NULL, *sep_data = NULL;
9491 PyObject *last_obj;
9492 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009493
Tim Peters05eba1f2004-08-27 21:32:02 +00009494 fseq = PySequence_Fast(seq, "");
9495 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009496 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009497 }
9498
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009499 /* NOTE: the following code can't call back into Python code,
9500 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009501 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009502
Tim Peters05eba1f2004-08-27 21:32:02 +00009503 seqlen = PySequence_Fast_GET_SIZE(fseq);
9504 /* If empty sequence, return u"". */
9505 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009506 Py_DECREF(fseq);
9507 Py_INCREF(unicode_empty);
9508 res = unicode_empty;
9509 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009510 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009511
Tim Peters05eba1f2004-08-27 21:32:02 +00009512 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009513 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009514 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009515 if (seqlen == 1) {
9516 if (PyUnicode_CheckExact(items[0])) {
9517 res = items[0];
9518 Py_INCREF(res);
9519 Py_DECREF(fseq);
9520 return res;
9521 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009522 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009523 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009524 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009525 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009526 /* Set up sep and seplen */
9527 if (separator == NULL) {
9528 /* fall back to a blank space separator */
9529 sep = PyUnicode_FromOrdinal(' ');
9530 if (!sep)
9531 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009532 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009533 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009534 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009535 else {
9536 if (!PyUnicode_Check(separator)) {
9537 PyErr_Format(PyExc_TypeError,
9538 "separator: expected str instance,"
9539 " %.80s found",
9540 Py_TYPE(separator)->tp_name);
9541 goto onError;
9542 }
9543 if (PyUnicode_READY(separator))
9544 goto onError;
9545 sep = separator;
9546 seplen = PyUnicode_GET_LENGTH(separator);
9547 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9548 /* inc refcount to keep this code path symmetric with the
9549 above case of a blank separator */
9550 Py_INCREF(sep);
9551 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009552 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009553 }
9554
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009555 /* There are at least two things to join, or else we have a subclass
9556 * of str in the sequence.
9557 * Do a pre-pass to figure out the total amount of space we'll
9558 * need (sz), and see whether all argument are strings.
9559 */
9560 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009561#ifdef Py_DEBUG
9562 use_memcpy = 0;
9563#else
9564 use_memcpy = 1;
9565#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009566 for (i = 0; i < seqlen; i++) {
9567 const Py_ssize_t old_sz = sz;
9568 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009569 if (!PyUnicode_Check(item)) {
9570 PyErr_Format(PyExc_TypeError,
9571 "sequence item %zd: expected str instance,"
9572 " %.80s found",
9573 i, Py_TYPE(item)->tp_name);
9574 goto onError;
9575 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009576 if (PyUnicode_READY(item) == -1)
9577 goto onError;
9578 sz += PyUnicode_GET_LENGTH(item);
9579 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009580 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009581 if (i != 0)
9582 sz += seplen;
9583 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9584 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009585 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009586 goto onError;
9587 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009588 if (use_memcpy && last_obj != NULL) {
9589 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9590 use_memcpy = 0;
9591 }
9592 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009593 }
Tim Petersced69f82003-09-16 20:30:58 +00009594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009596 if (res == NULL)
9597 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009598
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009599 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009600#ifdef Py_DEBUG
9601 use_memcpy = 0;
9602#else
9603 if (use_memcpy) {
9604 res_data = PyUnicode_1BYTE_DATA(res);
9605 kind = PyUnicode_KIND(res);
9606 if (seplen != 0)
9607 sep_data = PyUnicode_1BYTE_DATA(sep);
9608 }
9609#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009611 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009612 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009613 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009614 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009615 if (use_memcpy) {
9616 Py_MEMCPY(res_data,
9617 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009618 kind * seplen);
9619 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009620 }
9621 else {
9622 copy_characters(res, res_offset, sep, 0, seplen);
9623 res_offset += seplen;
9624 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009625 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009626 itemlen = PyUnicode_GET_LENGTH(item);
9627 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009628 if (use_memcpy) {
9629 Py_MEMCPY(res_data,
9630 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009631 kind * itemlen);
9632 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009633 }
9634 else {
9635 copy_characters(res, res_offset, item, 0, itemlen);
9636 res_offset += itemlen;
9637 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009638 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009639 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009640 if (use_memcpy)
9641 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009642 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009643 else
9644 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009645
Tim Peters05eba1f2004-08-27 21:32:02 +00009646 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009647 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009648 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009649 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650
Benjamin Peterson29060642009-01-31 22:14:21 +00009651 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009652 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009653 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009654 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655 return NULL;
9656}
9657
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive characters of the buffer `data`, beginning at character
   index `start`.  `kind` selects the element width: 1-byte buffers are
   filled with memset(), 2-byte buffers through Py_UCS2 and everything else
   through Py_UCS4.  `kind` must not be PyUnicode_WCHAR_KIND.
   Every argument is parenthesized in the expansion so that arbitrary
   expressions can be passed safely. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
9680
Victor Stinner9310abb2011-10-05 00:59:23 +02009681static PyObject *
9682pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009683 Py_ssize_t left,
9684 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009685 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009687 PyObject *u;
9688 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009689 int kind;
9690 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691
9692 if (left < 0)
9693 left = 0;
9694 if (right < 0)
9695 right = 0;
9696
Tim Peters7a29bd52001-09-12 03:03:31 +00009697 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009698 Py_INCREF(self);
9699 return self;
9700 }
9701
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009702 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9703 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009704 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9705 return NULL;
9706 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9708 if (fill > maxchar)
9709 maxchar = fill;
9710 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009711 if (!u)
9712 return NULL;
9713
9714 kind = PyUnicode_KIND(u);
9715 data = PyUnicode_DATA(u);
9716 if (left)
9717 FILL(kind, data, fill, 0, left);
9718 if (right)
9719 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009720 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009721 assert(_PyUnicode_CheckConsistency(u, 1));
9722 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009723}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009724#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009725
Alexander Belopolsky40018472011-02-26 01:02:56 +00009726PyObject *
9727PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009728{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009729 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730
9731 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009732 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009733 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 switch(PyUnicode_KIND(string)) {
9736 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009737 if (PyUnicode_IS_ASCII(string))
9738 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009739 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009740 PyUnicode_GET_LENGTH(string), keepends);
9741 else
9742 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009743 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009744 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009745 break;
9746 case PyUnicode_2BYTE_KIND:
9747 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009748 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009749 PyUnicode_GET_LENGTH(string), keepends);
9750 break;
9751 case PyUnicode_4BYTE_KIND:
9752 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009753 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009754 PyUnicode_GET_LENGTH(string), keepends);
9755 break;
9756 default:
9757 assert(0);
9758 list = 0;
9759 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009760 Py_DECREF(string);
9761 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009762}
9763
Alexander Belopolsky40018472011-02-26 01:02:56 +00009764static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009765split(PyObject *self,
9766 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009767 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009768{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009769 int kind1, kind2, kind;
9770 void *buf1, *buf2;
9771 Py_ssize_t len1, len2;
9772 PyObject* out;
9773
Guido van Rossumd57fd912000-03-10 22:53:23 +00009774 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009775 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009777 if (PyUnicode_READY(self) == -1)
9778 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 if (substring == NULL)
9781 switch(PyUnicode_KIND(self)) {
9782 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009783 if (PyUnicode_IS_ASCII(self))
9784 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009785 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009786 PyUnicode_GET_LENGTH(self), maxcount
9787 );
9788 else
9789 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009790 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009791 PyUnicode_GET_LENGTH(self), maxcount
9792 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009793 case PyUnicode_2BYTE_KIND:
9794 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009795 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009796 PyUnicode_GET_LENGTH(self), maxcount
9797 );
9798 case PyUnicode_4BYTE_KIND:
9799 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009800 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009801 PyUnicode_GET_LENGTH(self), maxcount
9802 );
9803 default:
9804 assert(0);
9805 return NULL;
9806 }
9807
9808 if (PyUnicode_READY(substring) == -1)
9809 return NULL;
9810
9811 kind1 = PyUnicode_KIND(self);
9812 kind2 = PyUnicode_KIND(substring);
9813 kind = kind1 > kind2 ? kind1 : kind2;
9814 buf1 = PyUnicode_DATA(self);
9815 buf2 = PyUnicode_DATA(substring);
9816 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009817 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009818 if (!buf1)
9819 return NULL;
9820 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009821 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 if (!buf2) {
9823 if (kind1 != kind) PyMem_Free(buf1);
9824 return NULL;
9825 }
9826 len1 = PyUnicode_GET_LENGTH(self);
9827 len2 = PyUnicode_GET_LENGTH(substring);
9828
9829 switch(kind) {
9830 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009831 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9832 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009833 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009834 else
9835 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009836 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009837 break;
9838 case PyUnicode_2BYTE_KIND:
9839 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009840 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009841 break;
9842 case PyUnicode_4BYTE_KIND:
9843 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009844 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009845 break;
9846 default:
9847 out = NULL;
9848 }
9849 if (kind1 != kind)
9850 PyMem_Free(buf1);
9851 if (kind2 != kind)
9852 PyMem_Free(buf2);
9853 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009854}
9855
Alexander Belopolsky40018472011-02-26 01:02:56 +00009856static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009857rsplit(PyObject *self,
9858 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009859 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009860{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009861 int kind1, kind2, kind;
9862 void *buf1, *buf2;
9863 Py_ssize_t len1, len2;
9864 PyObject* out;
9865
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009866 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009867 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009868
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 if (PyUnicode_READY(self) == -1)
9870 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872 if (substring == NULL)
9873 switch(PyUnicode_KIND(self)) {
9874 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009875 if (PyUnicode_IS_ASCII(self))
9876 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009877 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009878 PyUnicode_GET_LENGTH(self), maxcount
9879 );
9880 else
9881 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009882 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009883 PyUnicode_GET_LENGTH(self), maxcount
9884 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009885 case PyUnicode_2BYTE_KIND:
9886 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009887 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009888 PyUnicode_GET_LENGTH(self), maxcount
9889 );
9890 case PyUnicode_4BYTE_KIND:
9891 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009892 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009893 PyUnicode_GET_LENGTH(self), maxcount
9894 );
9895 default:
9896 assert(0);
9897 return NULL;
9898 }
9899
9900 if (PyUnicode_READY(substring) == -1)
9901 return NULL;
9902
9903 kind1 = PyUnicode_KIND(self);
9904 kind2 = PyUnicode_KIND(substring);
9905 kind = kind1 > kind2 ? kind1 : kind2;
9906 buf1 = PyUnicode_DATA(self);
9907 buf2 = PyUnicode_DATA(substring);
9908 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009909 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910 if (!buf1)
9911 return NULL;
9912 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009913 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009914 if (!buf2) {
9915 if (kind1 != kind) PyMem_Free(buf1);
9916 return NULL;
9917 }
9918 len1 = PyUnicode_GET_LENGTH(self);
9919 len2 = PyUnicode_GET_LENGTH(substring);
9920
9921 switch(kind) {
9922 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009923 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9924 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009925 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009926 else
9927 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009928 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009929 break;
9930 case PyUnicode_2BYTE_KIND:
9931 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009932 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009933 break;
9934 case PyUnicode_4BYTE_KIND:
9935 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009936 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 break;
9938 default:
9939 out = NULL;
9940 }
9941 if (kind1 != kind)
9942 PyMem_Free(buf1);
9943 if (kind2 != kind)
9944 PyMem_Free(buf2);
9945 return out;
9946}
9947
9948static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009949anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9950 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951{
9952 switch(kind) {
9953 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009954 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9955 return asciilib_find(buf1, len1, buf2, len2, offset);
9956 else
9957 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009958 case PyUnicode_2BYTE_KIND:
9959 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9960 case PyUnicode_4BYTE_KIND:
9961 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9962 }
9963 assert(0);
9964 return -1;
9965}
9966
9967static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009968anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9969 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009970{
9971 switch(kind) {
9972 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009973 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9974 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9975 else
9976 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009977 case PyUnicode_2BYTE_KIND:
9978 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9979 case PyUnicode_4BYTE_KIND:
9980 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9981 }
9982 assert(0);
9983 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009984}
9985
Alexander Belopolsky40018472011-02-26 01:02:56 +00009986static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009987replace(PyObject *self, PyObject *str1,
9988 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009989{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009990 PyObject *u;
9991 char *sbuf = PyUnicode_DATA(self);
9992 char *buf1 = PyUnicode_DATA(str1);
9993 char *buf2 = PyUnicode_DATA(str2);
9994 int srelease = 0, release1 = 0, release2 = 0;
9995 int skind = PyUnicode_KIND(self);
9996 int kind1 = PyUnicode_KIND(str1);
9997 int kind2 = PyUnicode_KIND(str2);
9998 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9999 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10000 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010001 int mayshrink;
10002 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010003
10004 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010005 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010006 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010007 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010008
Victor Stinner59de0ee2011-10-07 10:01:28 +020010009 if (str1 == str2)
10010 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010011 if (skind < kind1)
10012 /* substring too wide to be present */
10013 goto nothing;
10014
Victor Stinner49a0a212011-10-12 23:46:10 +020010015 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10016 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10017 /* Replacing str1 with str2 may cause a maxchar reduction in the
10018 result string. */
10019 mayshrink = (maxchar_str2 < maxchar);
10020 maxchar = Py_MAX(maxchar, maxchar_str2);
10021
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010023 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010024 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010026 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010028 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010029 Py_UCS4 u1, u2;
10030 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010032 if (findchar(sbuf, PyUnicode_KIND(self),
10033 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010034 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010037 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010039 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 rkind = PyUnicode_KIND(u);
10041 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10042 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010043 if (--maxcount < 0)
10044 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010046 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010047 }
10048 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 int rkind = skind;
10050 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 if (kind1 < rkind) {
10053 /* widen substring */
10054 buf1 = _PyUnicode_AsKind(str1, rkind);
10055 if (!buf1) goto error;
10056 release1 = 1;
10057 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010058 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010059 if (i < 0)
10060 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 if (rkind > kind2) {
10062 /* widen replacement */
10063 buf2 = _PyUnicode_AsKind(str2, rkind);
10064 if (!buf2) goto error;
10065 release2 = 1;
10066 }
10067 else if (rkind < kind2) {
10068 /* widen self and buf1 */
10069 rkind = kind2;
10070 if (release1) PyMem_Free(buf1);
10071 sbuf = _PyUnicode_AsKind(self, rkind);
10072 if (!sbuf) goto error;
10073 srelease = 1;
10074 buf1 = _PyUnicode_AsKind(str1, rkind);
10075 if (!buf1) goto error;
10076 release1 = 1;
10077 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010078 u = PyUnicode_New(slen, maxchar);
10079 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010080 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010081 assert(PyUnicode_KIND(u) == rkind);
10082 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010083
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010084 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010085 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010086 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010087 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010088 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010090
10091 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010092 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010093 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010094 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010095 if (i == -1)
10096 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010097 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010098 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010099 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010100 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010101 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010102 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010103 }
10104 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010105 Py_ssize_t n, i, j, ires;
10106 Py_ssize_t product, new_size;
10107 int rkind = skind;
10108 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010111 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 buf1 = _PyUnicode_AsKind(str1, rkind);
10113 if (!buf1) goto error;
10114 release1 = 1;
10115 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010116 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010117 if (n == 0)
10118 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010120 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010121 buf2 = _PyUnicode_AsKind(str2, rkind);
10122 if (!buf2) goto error;
10123 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010124 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010125 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010126 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010127 rkind = kind2;
10128 sbuf = _PyUnicode_AsKind(self, rkind);
10129 if (!sbuf) goto error;
10130 srelease = 1;
10131 if (release1) PyMem_Free(buf1);
10132 buf1 = _PyUnicode_AsKind(str1, rkind);
10133 if (!buf1) goto error;
10134 release1 = 1;
10135 }
10136 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10137 PyUnicode_GET_LENGTH(str1))); */
10138 product = n * (len2-len1);
10139 if ((product / (len2-len1)) != n) {
10140 PyErr_SetString(PyExc_OverflowError,
10141 "replace string is too long");
10142 goto error;
10143 }
10144 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010145 if (new_size == 0) {
10146 Py_INCREF(unicode_empty);
10147 u = unicode_empty;
10148 goto done;
10149 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10151 PyErr_SetString(PyExc_OverflowError,
10152 "replace string is too long");
10153 goto error;
10154 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010155 u = PyUnicode_New(new_size, maxchar);
10156 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010158 assert(PyUnicode_KIND(u) == rkind);
10159 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 ires = i = 0;
10161 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010162 while (n-- > 0) {
10163 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010164 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010165 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010166 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010167 if (j == -1)
10168 break;
10169 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010170 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010171 memcpy(res + rkind * ires,
10172 sbuf + rkind * i,
10173 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010175 }
10176 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010178 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010180 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010182 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010184 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010186 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010187 memcpy(res + rkind * ires,
10188 sbuf + rkind * i,
10189 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010190 }
10191 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010192 /* interleave */
10193 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010194 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010196 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010198 if (--n <= 0)
10199 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010200 memcpy(res + rkind * ires,
10201 sbuf + rkind * i,
10202 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 ires++;
10204 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010205 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010206 memcpy(res + rkind * ires,
10207 sbuf + rkind * i,
10208 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010209 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010210 }
10211
10212 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010213 unicode_adjust_maxchar(&u);
10214 if (u == NULL)
10215 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010216 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010217
10218 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 if (srelease)
10220 PyMem_FREE(sbuf);
10221 if (release1)
10222 PyMem_FREE(buf1);
10223 if (release2)
10224 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010225 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010227
Benjamin Peterson29060642009-01-31 22:14:21 +000010228 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010229 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 if (srelease)
10231 PyMem_FREE(sbuf);
10232 if (release1)
10233 PyMem_FREE(buf1);
10234 if (release2)
10235 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010236 if (PyUnicode_CheckExact(self)) {
10237 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010238 return self;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010239 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010240 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 error:
10242 if (srelease && sbuf)
10243 PyMem_FREE(sbuf);
10244 if (release1 && buf1)
10245 PyMem_FREE(buf1);
10246 if (release2 && buf2)
10247 PyMem_FREE(buf2);
10248 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010249}
10250
10251/* --- Unicode Object Methods --------------------------------------------- */
10252
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010253PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010254 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255\n\
10256Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010257characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258
10259static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010260unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010261{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010262 return fixup(self, fixtitle);
10263}
10264
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010265PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010266 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010267\n\
10268Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010269have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010270
10271static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010272unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010273{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010274 return fixup(self, fixcapitalize);
10275}
10276
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self into a list,
   replace every list item by fixup(item, fixcapitalize), then join the
   list again.  Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized = fixup(PyList_GET_ITEM(words, pos),
                                      fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10314
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010315/* Argument converter. Coerces to a single unicode character */
10316
10317static int
10318convert_uc(PyObject *obj, void *addr)
10319{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010321 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010322
Benjamin Peterson14339b62009-01-31 16:36:08 +000010323 uniobj = PyUnicode_FromObject(obj);
10324 if (uniobj == NULL) {
10325 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010326 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010327 return 0;
10328 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010330 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010331 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010332 Py_DECREF(uniobj);
10333 return 0;
10334 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010336 Py_DECREF(uniobj);
10337 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010338}
10339
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010340PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010341 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010342\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010343Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010344done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010345
10346static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010347unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010349 Py_ssize_t marg, left;
10350 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 Py_UCS4 fillchar = ' ';
10352
Victor Stinnere9a29352011-10-01 02:14:59 +020010353 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010355
Victor Stinnere9a29352011-10-01 02:14:59 +020010356 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010357 return NULL;
10358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010360 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010361 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010362 }
10363
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365 left = marg / 2 + (marg & width & 1);
10366
Victor Stinner9310abb2011-10-05 00:59:23 +020010367 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010368}
10369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370/* This function assumes that str1 and str2 are readied by the caller. */
10371
Marc-André Lemburge5034372000-08-08 08:04:29 +000010372static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010373unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010374{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 int kind1, kind2;
10376 void *data1, *data2;
10377 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010378
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 kind1 = PyUnicode_KIND(str1);
10380 kind2 = PyUnicode_KIND(str2);
10381 data1 = PyUnicode_DATA(str1);
10382 data2 = PyUnicode_DATA(str2);
10383 len1 = PyUnicode_GET_LENGTH(str1);
10384 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386 for (i = 0; i < len1 && i < len2; ++i) {
10387 Py_UCS4 c1, c2;
10388 c1 = PyUnicode_READ(kind1, data1, i);
10389 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010390
10391 if (c1 != c2)
10392 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010393 }
10394
10395 return (len1 < len2) ? -1 : (len1 != len2);
10396}
10397
Alexander Belopolsky40018472011-02-26 01:02:56 +000010398int
10399PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010400{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10402 if (PyUnicode_READY(left) == -1 ||
10403 PyUnicode_READY(right) == -1)
10404 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010405 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010406 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010407 PyErr_Format(PyExc_TypeError,
10408 "Can't compare %.100s and %.100s",
10409 left->ob_type->tp_name,
10410 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010411 return -1;
10412}
10413
Martin v. Löwis5b222132007-06-10 09:51:05 +000010414int
10415PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10416{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 Py_ssize_t i;
10418 int kind;
10419 void *data;
10420 Py_UCS4 chr;
10421
Victor Stinner910337b2011-10-03 03:20:16 +020010422 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 if (PyUnicode_READY(uni) == -1)
10424 return -1;
10425 kind = PyUnicode_KIND(uni);
10426 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010427 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10429 if (chr != str[i])
10430 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010431 /* This check keeps Python strings that end in '\0' from comparing equal
10432 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010434 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010435 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010436 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010437 return 0;
10438}
10439
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010440
Benjamin Peterson29060642009-01-31 22:14:21 +000010441#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010442 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010443
Alexander Belopolsky40018472011-02-26 01:02:56 +000010444PyObject *
10445PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010446{
10447 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010448
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010449 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10450 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 if (PyUnicode_READY(left) == -1 ||
10452 PyUnicode_READY(right) == -1)
10453 return NULL;
10454 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10455 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010456 if (op == Py_EQ) {
10457 Py_INCREF(Py_False);
10458 return Py_False;
10459 }
10460 if (op == Py_NE) {
10461 Py_INCREF(Py_True);
10462 return Py_True;
10463 }
10464 }
10465 if (left == right)
10466 result = 0;
10467 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010468 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010469
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010470 /* Convert the return value to a Boolean */
10471 switch (op) {
10472 case Py_EQ:
10473 v = TEST_COND(result == 0);
10474 break;
10475 case Py_NE:
10476 v = TEST_COND(result != 0);
10477 break;
10478 case Py_LE:
10479 v = TEST_COND(result <= 0);
10480 break;
10481 case Py_GE:
10482 v = TEST_COND(result >= 0);
10483 break;
10484 case Py_LT:
10485 v = TEST_COND(result == -1);
10486 break;
10487 case Py_GT:
10488 v = TEST_COND(result == 1);
10489 break;
10490 default:
10491 PyErr_BadArgument();
10492 return NULL;
10493 }
10494 Py_INCREF(v);
10495 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010496 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010497
Brian Curtindfc80e32011-08-10 20:28:54 -050010498 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010499}
10500
Alexander Belopolsky40018472011-02-26 01:02:56 +000010501int
10502PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010503{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010504 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 int kind1, kind2, kind;
10506 void *buf1, *buf2;
10507 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010508 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010509
10510 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010511 sub = PyUnicode_FromObject(element);
10512 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010513 PyErr_Format(PyExc_TypeError,
10514 "'in <string>' requires string as left operand, not %s",
10515 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010516 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010517 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 if (PyUnicode_READY(sub) == -1)
10519 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010520
Thomas Wouters477c8d52006-05-27 19:21:47 +000010521 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010522 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010523 Py_DECREF(sub);
10524 return -1;
10525 }
10526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 kind1 = PyUnicode_KIND(str);
10528 kind2 = PyUnicode_KIND(sub);
10529 kind = kind1 > kind2 ? kind1 : kind2;
10530 buf1 = PyUnicode_DATA(str);
10531 buf2 = PyUnicode_DATA(sub);
10532 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010533 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 if (!buf1) {
10535 Py_DECREF(sub);
10536 return -1;
10537 }
10538 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010539 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 if (!buf2) {
10541 Py_DECREF(sub);
10542 if (kind1 != kind) PyMem_Free(buf1);
10543 return -1;
10544 }
10545 len1 = PyUnicode_GET_LENGTH(str);
10546 len2 = PyUnicode_GET_LENGTH(sub);
10547
10548 switch(kind) {
10549 case PyUnicode_1BYTE_KIND:
10550 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10551 break;
10552 case PyUnicode_2BYTE_KIND:
10553 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10554 break;
10555 case PyUnicode_4BYTE_KIND:
10556 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10557 break;
10558 default:
10559 result = -1;
10560 assert(0);
10561 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010562
10563 Py_DECREF(str);
10564 Py_DECREF(sub);
10565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 if (kind1 != kind)
10567 PyMem_Free(buf1);
10568 if (kind2 != kind)
10569 PyMem_Free(buf2);
10570
Guido van Rossum403d68b2000-03-13 15:55:09 +000010571 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010572}
10573
Guido van Rossumd57fd912000-03-10 22:53:23 +000010574/* Concat to string or Unicode object giving a new Unicode object. */
10575
Alexander Belopolsky40018472011-02-26 01:02:56 +000010576PyObject *
10577PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010578{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010580 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010581
10582 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010584 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010585 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010587 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010588 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589
10590 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010591 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010592 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010594 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010595 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010596 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598 }
10599
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010601 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10602 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 w = PyUnicode_New(
10606 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10607 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010608 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010609 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010610 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10611 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612 Py_DECREF(u);
10613 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010614 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010616
Benjamin Peterson29060642009-01-31 22:14:21 +000010617 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618 Py_XDECREF(u);
10619 Py_XDECREF(v);
10620 return NULL;
10621}
10622
Victor Stinnerb0923652011-10-04 01:17:31 +020010623static void
10624unicode_append_inplace(PyObject **p_left, PyObject *right)
10625{
10626 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010627
10628 assert(PyUnicode_IS_READY(*p_left));
10629 assert(PyUnicode_IS_READY(right));
10630
10631 left_len = PyUnicode_GET_LENGTH(*p_left);
10632 right_len = PyUnicode_GET_LENGTH(right);
10633 if (left_len > PY_SSIZE_T_MAX - right_len) {
10634 PyErr_SetString(PyExc_OverflowError,
10635 "strings are too large to concat");
10636 goto error;
10637 }
10638 new_len = left_len + right_len;
10639
10640 /* Now we own the last reference to 'left', so we can resize it
10641 * in-place.
10642 */
10643 if (unicode_resize(p_left, new_len) != 0) {
10644 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10645 * deallocated so it cannot be put back into
10646 * 'variable'. The MemoryError is raised when there
10647 * is no value in 'variable', which might (very
10648 * remotely) be a cause of incompatibilities.
10649 */
10650 goto error;
10651 }
10652 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010653 copy_characters(*p_left, left_len, right, 0, right_len);
10654 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010655 return;
10656
10657error:
10658 Py_DECREF(*p_left);
10659 *p_left = NULL;
10660}
10661
Walter Dörwald1ab83302007-05-18 17:15:44 +000010662void
Victor Stinner23e56682011-10-03 03:54:37 +020010663PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010664{
Victor Stinner23e56682011-10-03 03:54:37 +020010665 PyObject *left, *res;
10666
10667 if (p_left == NULL) {
10668 if (!PyErr_Occurred())
10669 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010670 return;
10671 }
Victor Stinner23e56682011-10-03 03:54:37 +020010672 left = *p_left;
10673 if (right == NULL || !PyUnicode_Check(left)) {
10674 if (!PyErr_Occurred())
10675 PyErr_BadInternalCall();
10676 goto error;
10677 }
10678
Victor Stinnere1335c72011-10-04 20:53:03 +020010679 if (PyUnicode_READY(left))
10680 goto error;
10681 if (PyUnicode_READY(right))
10682 goto error;
10683
Victor Stinner23e56682011-10-03 03:54:37 +020010684 if (PyUnicode_CheckExact(left) && left != unicode_empty
10685 && PyUnicode_CheckExact(right) && right != unicode_empty
10686 && unicode_resizable(left)
10687 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10688 || _PyUnicode_WSTR(left) != NULL))
10689 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010690 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10691 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010692 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010693 not so different than duplicating the string. */
10694 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010695 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010696 unicode_append_inplace(p_left, right);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010697 assert(p_left == NULL || _PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010698 return;
10699 }
10700 }
10701
10702 res = PyUnicode_Concat(left, right);
10703 if (res == NULL)
10704 goto error;
10705 Py_DECREF(left);
10706 *p_left = res;
10707 return;
10708
10709error:
10710 Py_DECREF(*p_left);
10711 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010712}
10713
10714void
10715PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10716{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010717 PyUnicode_Append(pleft, right);
10718 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010719}
10720
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010721PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010722 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010724Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010725string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010726interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727
10728static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010729unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010730{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010731 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010732 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010733 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 int kind1, kind2, kind;
10736 void *buf1, *buf2;
10737 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010738
Jesus Ceaac451502011-04-20 17:09:23 +020010739 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10740 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010741 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 kind1 = PyUnicode_KIND(self);
10744 kind2 = PyUnicode_KIND(substring);
10745 kind = kind1 > kind2 ? kind1 : kind2;
10746 buf1 = PyUnicode_DATA(self);
10747 buf2 = PyUnicode_DATA(substring);
10748 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010749 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010750 if (!buf1) {
10751 Py_DECREF(substring);
10752 return NULL;
10753 }
10754 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010755 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010756 if (!buf2) {
10757 Py_DECREF(substring);
10758 if (kind1 != kind) PyMem_Free(buf1);
10759 return NULL;
10760 }
10761 len1 = PyUnicode_GET_LENGTH(self);
10762 len2 = PyUnicode_GET_LENGTH(substring);
10763
10764 ADJUST_INDICES(start, end, len1);
10765 switch(kind) {
10766 case PyUnicode_1BYTE_KIND:
10767 iresult = ucs1lib_count(
10768 ((Py_UCS1*)buf1) + start, end - start,
10769 buf2, len2, PY_SSIZE_T_MAX
10770 );
10771 break;
10772 case PyUnicode_2BYTE_KIND:
10773 iresult = ucs2lib_count(
10774 ((Py_UCS2*)buf1) + start, end - start,
10775 buf2, len2, PY_SSIZE_T_MAX
10776 );
10777 break;
10778 case PyUnicode_4BYTE_KIND:
10779 iresult = ucs4lib_count(
10780 ((Py_UCS4*)buf1) + start, end - start,
10781 buf2, len2, PY_SSIZE_T_MAX
10782 );
10783 break;
10784 default:
10785 assert(0); iresult = 0;
10786 }
10787
10788 result = PyLong_FromSsize_t(iresult);
10789
10790 if (kind1 != kind)
10791 PyMem_Free(buf1);
10792 if (kind2 != kind)
10793 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010794
10795 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010796
Guido van Rossumd57fd912000-03-10 22:53:23 +000010797 return result;
10798}
10799
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010800PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010801 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010802\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010803Encode S using the codec registered for encoding. Default encoding\n\
10804is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010805handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010806a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10807'xmlcharrefreplace' as well as any other name registered with\n\
10808codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010809
10810static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010811unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010813 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010814 char *encoding = NULL;
10815 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010816
Benjamin Peterson308d6372009-09-18 21:42:35 +000010817 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10818 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010820 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010821}
10822
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010823PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010824 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825\n\
10826Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010827If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828
10829static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010830unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010832 Py_ssize_t i, j, line_pos, src_len, incr;
10833 Py_UCS4 ch;
10834 PyObject *u;
10835 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010837 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010838 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839
10840 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010841 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842
Antoine Pitrou22425222011-10-04 19:10:51 +020010843 if (PyUnicode_READY(self) == -1)
10844 return NULL;
10845
Thomas Wouters7e474022000-07-16 12:04:32 +000010846 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010847 src_len = PyUnicode_GET_LENGTH(self);
10848 i = j = line_pos = 0;
10849 kind = PyUnicode_KIND(self);
10850 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010851 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010852 for (; i < src_len; i++) {
10853 ch = PyUnicode_READ(kind, src_data, i);
10854 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010855 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010856 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010857 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010858 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010859 goto overflow;
10860 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010861 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010862 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010863 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010865 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010866 goto overflow;
10867 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010869 if (ch == '\n' || ch == '\r')
10870 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010872 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010873 if (!found && PyUnicode_CheckExact(self)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +010010874 Py_INCREF(self);
10875 return self;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010876 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010877
Guido van Rossumd57fd912000-03-10 22:53:23 +000010878 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010879 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010880 if (!u)
10881 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010882 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010883
Antoine Pitroue71d5742011-10-04 15:55:09 +020010884 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010885
Antoine Pitroue71d5742011-10-04 15:55:09 +020010886 for (; i < src_len; i++) {
10887 ch = PyUnicode_READ(kind, src_data, i);
10888 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010889 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010890 incr = tabsize - (line_pos % tabsize);
10891 line_pos += incr;
10892 while (incr--) {
10893 PyUnicode_WRITE(kind, dest_data, j, ' ');
10894 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010895 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010896 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010897 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010898 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010899 line_pos++;
10900 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010901 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010902 if (ch == '\n' || ch == '\r')
10903 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010904 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010905 }
10906 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010907 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010908
Antoine Pitroue71d5742011-10-04 15:55:09 +020010909 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010910 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10911 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010912}
10913
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010914PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010915 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010916\n\
10917Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010918such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010919arguments start and end are interpreted as in slice notation.\n\
10920\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010921Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922
10923static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010924unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010926 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010927 Py_ssize_t start;
10928 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010929 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010930
Jesus Ceaac451502011-04-20 17:09:23 +020010931 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10932 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010935 if (PyUnicode_READY(self) == -1)
10936 return NULL;
10937 if (PyUnicode_READY(substring) == -1)
10938 return NULL;
10939
Victor Stinner7931d9a2011-11-04 00:22:48 +010010940 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010941
10942 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944 if (result == -2)
10945 return NULL;
10946
Christian Heimes217cfd12007-12-02 14:31:20 +000010947 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948}
10949
10950static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010951unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010953 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10954 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010955 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010956 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957}
10958
Guido van Rossumc2504932007-09-18 19:42:40 +000010959/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010960 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010961static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010962unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963{
Guido van Rossumc2504932007-09-18 19:42:40 +000010964 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010965 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010967 if (_PyUnicode_HASH(self) != -1)
10968 return _PyUnicode_HASH(self);
10969 if (PyUnicode_READY(self) == -1)
10970 return -1;
10971 len = PyUnicode_GET_LENGTH(self);
10972
10973 /* The hash function as a macro, gets expanded three times below. */
10974#define HASH(P) \
10975 x = (Py_uhash_t)*P << 7; \
10976 while (--len >= 0) \
10977 x = (1000003*x) ^ (Py_uhash_t)*P++;
10978
10979 switch (PyUnicode_KIND(self)) {
10980 case PyUnicode_1BYTE_KIND: {
10981 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10982 HASH(c);
10983 break;
10984 }
10985 case PyUnicode_2BYTE_KIND: {
10986 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10987 HASH(s);
10988 break;
10989 }
10990 default: {
10991 Py_UCS4 *l;
10992 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10993 "Impossible switch case in unicode_hash");
10994 l = PyUnicode_4BYTE_DATA(self);
10995 HASH(l);
10996 break;
10997 }
10998 }
10999 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11000
Guido van Rossumc2504932007-09-18 19:42:40 +000011001 if (x == -1)
11002 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011004 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011006#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011008PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011009 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011011Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012
11013static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011016 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011017 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011018 Py_ssize_t start;
11019 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020
Jesus Ceaac451502011-04-20 17:09:23 +020011021 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11022 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011025 if (PyUnicode_READY(self) == -1)
11026 return NULL;
11027 if (PyUnicode_READY(substring) == -1)
11028 return NULL;
11029
Victor Stinner7931d9a2011-11-04 00:22:48 +010011030 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031
11032 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011034 if (result == -2)
11035 return NULL;
11036
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037 if (result < 0) {
11038 PyErr_SetString(PyExc_ValueError, "substring not found");
11039 return NULL;
11040 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011041
Christian Heimes217cfd12007-12-02 14:31:20 +000011042 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043}
11044
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011045PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011046 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011047\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011048Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011049at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050
11051static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011052unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011054 Py_ssize_t i, length;
11055 int kind;
11056 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057 int cased;
11058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011059 if (PyUnicode_READY(self) == -1)
11060 return NULL;
11061 length = PyUnicode_GET_LENGTH(self);
11062 kind = PyUnicode_KIND(self);
11063 data = PyUnicode_DATA(self);
11064
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011066 if (length == 1)
11067 return PyBool_FromLong(
11068 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011070 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011072 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011073
Guido van Rossumd57fd912000-03-10 22:53:23 +000011074 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011075 for (i = 0; i < length; i++) {
11076 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011077
Benjamin Peterson29060642009-01-31 22:14:21 +000011078 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11079 return PyBool_FromLong(0);
11080 else if (!cased && Py_UNICODE_ISLOWER(ch))
11081 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011083 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084}
11085
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011086PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011087 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011089Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011090at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011091
11092static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011093unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 Py_ssize_t i, length;
11096 int kind;
11097 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011098 int cased;
11099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011100 if (PyUnicode_READY(self) == -1)
11101 return NULL;
11102 length = PyUnicode_GET_LENGTH(self);
11103 kind = PyUnicode_KIND(self);
11104 data = PyUnicode_DATA(self);
11105
Guido van Rossumd57fd912000-03-10 22:53:23 +000011106 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011107 if (length == 1)
11108 return PyBool_FromLong(
11109 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011111 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011112 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011113 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011114
Guido van Rossumd57fd912000-03-10 22:53:23 +000011115 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011116 for (i = 0; i < length; i++) {
11117 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011118
Benjamin Peterson29060642009-01-31 22:14:21 +000011119 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11120 return PyBool_FromLong(0);
11121 else if (!cased && Py_UNICODE_ISUPPER(ch))
11122 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011123 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011124 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011125}
11126
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011127PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011128 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011130Return True if S is a titlecased string and there is at least one\n\
11131character in S, i.e. upper- and titlecase characters may only\n\
11132follow uncased characters and lowercase characters only cased ones.\n\
11133Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011134
11135static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011136unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011138 Py_ssize_t i, length;
11139 int kind;
11140 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141 int cased, previous_is_cased;
11142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011143 if (PyUnicode_READY(self) == -1)
11144 return NULL;
11145 length = PyUnicode_GET_LENGTH(self);
11146 kind = PyUnicode_KIND(self);
11147 data = PyUnicode_DATA(self);
11148
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011150 if (length == 1) {
11151 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11152 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11153 (Py_UNICODE_ISUPPER(ch) != 0));
11154 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011155
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011156 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011158 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011159
Guido van Rossumd57fd912000-03-10 22:53:23 +000011160 cased = 0;
11161 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162 for (i = 0; i < length; i++) {
11163 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011164
Benjamin Peterson29060642009-01-31 22:14:21 +000011165 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11166 if (previous_is_cased)
11167 return PyBool_FromLong(0);
11168 previous_is_cased = 1;
11169 cased = 1;
11170 }
11171 else if (Py_UNICODE_ISLOWER(ch)) {
11172 if (!previous_is_cased)
11173 return PyBool_FromLong(0);
11174 previous_is_cased = 1;
11175 cased = 1;
11176 }
11177 else
11178 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011179 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011180 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011181}
11182
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011183PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011184 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011186Return True if all characters in S are whitespace\n\
11187and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188
11189static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011190unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011192 Py_ssize_t i, length;
11193 int kind;
11194 void *data;
11195
11196 if (PyUnicode_READY(self) == -1)
11197 return NULL;
11198 length = PyUnicode_GET_LENGTH(self);
11199 kind = PyUnicode_KIND(self);
11200 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011201
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011203 if (length == 1)
11204 return PyBool_FromLong(
11205 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011207 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011208 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011209 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 for (i = 0; i < length; i++) {
11212 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011213 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011214 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011216 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217}
11218
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011219PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011220 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011221\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011222Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011223and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011224
11225static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011226unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011227{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011228 Py_ssize_t i, length;
11229 int kind;
11230 void *data;
11231
11232 if (PyUnicode_READY(self) == -1)
11233 return NULL;
11234 length = PyUnicode_GET_LENGTH(self);
11235 kind = PyUnicode_KIND(self);
11236 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011237
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011238 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011239 if (length == 1)
11240 return PyBool_FromLong(
11241 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011242
11243 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011244 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011245 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011247 for (i = 0; i < length; i++) {
11248 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011249 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011250 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011251 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011252}
11253
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011254PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011255 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011256\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011257Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011258and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011259
11260static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011261unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011262{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011263 int kind;
11264 void *data;
11265 Py_ssize_t len, i;
11266
11267 if (PyUnicode_READY(self) == -1)
11268 return NULL;
11269
11270 kind = PyUnicode_KIND(self);
11271 data = PyUnicode_DATA(self);
11272 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011273
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011274 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011275 if (len == 1) {
11276 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11277 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11278 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011279
11280 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011281 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011282 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011283
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011284 for (i = 0; i < len; i++) {
11285 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011286 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011287 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011288 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011289 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011290}
11291
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011292PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011293 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011295Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011296False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297
11298static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011299unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011301 Py_ssize_t i, length;
11302 int kind;
11303 void *data;
11304
11305 if (PyUnicode_READY(self) == -1)
11306 return NULL;
11307 length = PyUnicode_GET_LENGTH(self);
11308 kind = PyUnicode_KIND(self);
11309 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011310
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011312 if (length == 1)
11313 return PyBool_FromLong(
11314 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011316 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011317 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011318 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011320 for (i = 0; i < length; i++) {
11321 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011322 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011324 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325}
11326
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011327PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011328 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011330Return True if all characters in S are digits\n\
11331and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332
11333static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011334unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011335{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011336 Py_ssize_t i, length;
11337 int kind;
11338 void *data;
11339
11340 if (PyUnicode_READY(self) == -1)
11341 return NULL;
11342 length = PyUnicode_GET_LENGTH(self);
11343 kind = PyUnicode_KIND(self);
11344 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011347 if (length == 1) {
11348 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11349 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11350 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011352 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011354 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011356 for (i = 0; i < length; i++) {
11357 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011358 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011360 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361}
11362
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011363PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011364 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011365\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011366Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011367False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368
11369static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011370unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011372 Py_ssize_t i, length;
11373 int kind;
11374 void *data;
11375
11376 if (PyUnicode_READY(self) == -1)
11377 return NULL;
11378 length = PyUnicode_GET_LENGTH(self);
11379 kind = PyUnicode_KIND(self);
11380 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011383 if (length == 1)
11384 return PyBool_FromLong(
11385 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011387 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011388 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011389 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011391 for (i = 0; i < length; i++) {
11392 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011393 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011394 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011395 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011396}
11397
Martin v. Löwis47383402007-08-15 07:32:56 +000011398int
11399PyUnicode_IsIdentifier(PyObject *self)
11400{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011401 int kind;
11402 void *data;
11403 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011404 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011405
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011406 if (PyUnicode_READY(self) == -1) {
11407 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011408 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409 }
11410
11411 /* Special case for empty strings */
11412 if (PyUnicode_GET_LENGTH(self) == 0)
11413 return 0;
11414 kind = PyUnicode_KIND(self);
11415 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011416
11417 /* PEP 3131 says that the first character must be in
11418 XID_Start and subsequent characters in XID_Continue,
11419 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011420 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011421 letters, digits, underscore). However, given the current
11422 definition of XID_Start and XID_Continue, it is sufficient
11423 to check just for these, except that _ must be allowed
11424 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011425 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011426 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011427 return 0;
11428
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011429 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011430 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011431 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011432 return 1;
11433}
11434
11435PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011436 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011437\n\
11438Return True if S is a valid identifier according\n\
11439to the language definition.");
11440
11441static PyObject*
11442unicode_isidentifier(PyObject *self)
11443{
11444 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11445}
11446
Georg Brandl559e5d72008-06-11 18:37:52 +000011447PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011448 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011449\n\
11450Return True if all characters in S are considered\n\
11451printable in repr() or S is empty, False otherwise.");
11452
11453static PyObject*
11454unicode_isprintable(PyObject *self)
11455{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011456 Py_ssize_t i, length;
11457 int kind;
11458 void *data;
11459
11460 if (PyUnicode_READY(self) == -1)
11461 return NULL;
11462 length = PyUnicode_GET_LENGTH(self);
11463 kind = PyUnicode_KIND(self);
11464 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011465
11466 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 if (length == 1)
11468 return PyBool_FromLong(
11469 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471 for (i = 0; i < length; i++) {
11472 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011473 Py_RETURN_FALSE;
11474 }
11475 }
11476 Py_RETURN_TRUE;
11477}
11478
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011479PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011480 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481\n\
11482Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011483iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484
11485static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011486unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011488 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489}
11490
Martin v. Löwis18e16552006-02-15 17:27:45 +000011491static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011492unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011494 if (PyUnicode_READY(self) == -1)
11495 return -1;
11496 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497}
11498
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011499PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011500 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011502Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011503done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504
11505static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011506unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011508 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011509 Py_UCS4 fillchar = ' ';
11510
11511 if (PyUnicode_READY(self) == -1)
11512 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011513
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011514 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515 return NULL;
11516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011519 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520 }
11521
Victor Stinner7931d9a2011-11-04 00:22:48 +010011522 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523}
11524
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011525PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011526 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011528Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529
11530static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011531unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533 return fixup(self, fixlower);
11534}
11535
/* Strip modes; each value doubles as an index into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string minus its "|O:" prefix */
#define STRIPNAME(i) (stripformat[i] + 3)
11544
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011545/* externally visible for str.strip(unicode) */
11546PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011547_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011548{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011549 void *data;
11550 int kind;
11551 Py_ssize_t i, j, len;
11552 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011554 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11555 return NULL;
11556
11557 kind = PyUnicode_KIND(self);
11558 data = PyUnicode_DATA(self);
11559 len = PyUnicode_GET_LENGTH(self);
11560 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11561 PyUnicode_DATA(sepobj),
11562 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011563
Benjamin Peterson14339b62009-01-31 16:36:08 +000011564 i = 0;
11565 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011566 while (i < len &&
11567 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011568 i++;
11569 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011570 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011571
Benjamin Peterson14339b62009-01-31 16:36:08 +000011572 j = len;
11573 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011574 do {
11575 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011576 } while (j >= i &&
11577 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011578 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011579 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011580
Victor Stinner7931d9a2011-11-04 00:22:48 +010011581 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011582}
11583
11584PyObject*
11585PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11586{
11587 unsigned char *data;
11588 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011589 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011590
Victor Stinnerde636f32011-10-01 03:55:54 +020011591 if (PyUnicode_READY(self) == -1)
11592 return NULL;
11593
11594 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11595
Victor Stinner12bab6d2011-10-01 01:53:49 +020011596 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011597 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011598 if (PyUnicode_CheckExact(self)) {
11599 Py_INCREF(self);
11600 return self;
11601 }
11602 else
11603 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011604 }
11605
Victor Stinner12bab6d2011-10-01 01:53:49 +020011606 length = end - start;
11607 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011608 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609
Victor Stinnerde636f32011-10-01 03:55:54 +020011610 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011611 PyErr_SetString(PyExc_IndexError, "string index out of range");
11612 return NULL;
11613 }
11614
Victor Stinnerb9275c12011-10-05 14:01:42 +020011615 if (PyUnicode_IS_ASCII(self)) {
11616 kind = PyUnicode_KIND(self);
11617 data = PyUnicode_1BYTE_DATA(self);
11618 return unicode_fromascii(data + start, length);
11619 }
11620 else {
11621 kind = PyUnicode_KIND(self);
11622 data = PyUnicode_1BYTE_DATA(self);
11623 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011624 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011625 length);
11626 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628
11629static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011630do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 int kind;
11633 void *data;
11634 Py_ssize_t len, i, j;
11635
11636 if (PyUnicode_READY(self) == -1)
11637 return NULL;
11638
11639 kind = PyUnicode_KIND(self);
11640 data = PyUnicode_DATA(self);
11641 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011642
Benjamin Peterson14339b62009-01-31 16:36:08 +000011643 i = 0;
11644 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011645 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011646 i++;
11647 }
11648 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011649
Benjamin Peterson14339b62009-01-31 16:36:08 +000011650 j = len;
11651 if (striptype != LEFTSTRIP) {
11652 do {
11653 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011654 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011655 j++;
11656 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011657
Victor Stinner7931d9a2011-11-04 00:22:48 +010011658 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659}
11660
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011661
11662static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011663do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011664{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011665 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011666
Benjamin Peterson14339b62009-01-31 16:36:08 +000011667 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11668 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011669
Benjamin Peterson14339b62009-01-31 16:36:08 +000011670 if (sep != NULL && sep != Py_None) {
11671 if (PyUnicode_Check(sep))
11672 return _PyUnicode_XStrip(self, striptype, sep);
11673 else {
11674 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011675 "%s arg must be None or str",
11676 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011677 return NULL;
11678 }
11679 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011680
Benjamin Peterson14339b62009-01-31 16:36:08 +000011681 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011682}
11683
11684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011685PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011686 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011687\n\
11688Return a copy of the string S with leading and trailing\n\
11689whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011690If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011691
11692static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011693unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011694{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011695 if (PyTuple_GET_SIZE(args) == 0)
11696 return do_strip(self, BOTHSTRIP); /* Common case */
11697 else
11698 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011699}
11700
11701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011702PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011703 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011704\n\
11705Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011706If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011707
11708static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011709unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011710{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011711 if (PyTuple_GET_SIZE(args) == 0)
11712 return do_strip(self, LEFTSTRIP); /* Common case */
11713 else
11714 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011715}
11716
11717
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011718PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011719 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011720\n\
11721Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011722If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011723
11724static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011725unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011726{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011727 if (PyTuple_GET_SIZE(args) == 0)
11728 return do_strip(self, RIGHTSTRIP); /* Common case */
11729 else
11730 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011731}
11732
11733
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011735unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011737 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011738 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739
Georg Brandl222de0f2009-04-12 12:01:50 +000011740 if (len < 1) {
11741 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011742 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011743 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744
Tim Peters7a29bd52001-09-12 03:03:31 +000011745 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746 /* no repeat, return original string */
11747 Py_INCREF(str);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011748 return str;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749 }
Tim Peters8f422462000-09-09 06:13:41 +000011750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751 if (PyUnicode_READY(str) == -1)
11752 return NULL;
11753
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011754 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011755 PyErr_SetString(PyExc_OverflowError,
11756 "repeated string is too long");
11757 return NULL;
11758 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011759 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011760
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011761 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762 if (!u)
11763 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011764 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766 if (PyUnicode_GET_LENGTH(str) == 1) {
11767 const int kind = PyUnicode_KIND(str);
11768 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11769 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011770 if (kind == PyUnicode_1BYTE_KIND)
11771 memset(to, (unsigned char)fill_char, len);
11772 else {
11773 for (n = 0; n < len; ++n)
11774 PyUnicode_WRITE(kind, to, n, fill_char);
11775 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011776 }
11777 else {
11778 /* number of characters copied this far */
11779 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011780 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 char *to = (char *) PyUnicode_DATA(u);
11782 Py_MEMCPY(to, PyUnicode_DATA(str),
11783 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011784 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 n = (done <= nchars-done) ? done : nchars-done;
11786 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011787 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011788 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789 }
11790
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011791 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011792 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793}
11794
Alexander Belopolsky40018472011-02-26 01:02:56 +000011795PyObject *
11796PyUnicode_Replace(PyObject *obj,
11797 PyObject *subobj,
11798 PyObject *replobj,
11799 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800{
11801 PyObject *self;
11802 PyObject *str1;
11803 PyObject *str2;
11804 PyObject *result;
11805
11806 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011807 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011808 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011810 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011811 Py_DECREF(self);
11812 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813 }
11814 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011815 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011816 Py_DECREF(self);
11817 Py_DECREF(str1);
11818 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011820 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821 Py_DECREF(self);
11822 Py_DECREF(str1);
11823 Py_DECREF(str2);
11824 return result;
11825}
11826
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011827PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011828 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829\n\
11830Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011831old replaced by new. If the optional argument count is\n\
11832given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011833
11834static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011835unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 PyObject *str1;
11838 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011839 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840 PyObject *result;
11841
Martin v. Löwis18e16552006-02-15 17:27:45 +000011842 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011845 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011846 str1 = PyUnicode_FromObject(str1);
11847 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11848 return NULL;
11849 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011850 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011851 Py_DECREF(str1);
11852 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011853 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854
11855 result = replace(self, str1, str2, maxcount);
11856
11857 Py_DECREF(str1);
11858 Py_DECREF(str2);
11859 return result;
11860}
11861
Alexander Belopolsky40018472011-02-26 01:02:56 +000011862static PyObject *
11863unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011865 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011866 Py_ssize_t isize;
11867 Py_ssize_t osize, squote, dquote, i, o;
11868 Py_UCS4 max, quote;
11869 int ikind, okind;
11870 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011873 return NULL;
11874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011875 isize = PyUnicode_GET_LENGTH(unicode);
11876 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011877
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 /* Compute length of output, quote characters, and
11879 maximum character */
11880 osize = 2; /* quotes */
11881 max = 127;
11882 squote = dquote = 0;
11883 ikind = PyUnicode_KIND(unicode);
11884 for (i = 0; i < isize; i++) {
11885 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11886 switch (ch) {
11887 case '\'': squote++; osize++; break;
11888 case '"': dquote++; osize++; break;
11889 case '\\': case '\t': case '\r': case '\n':
11890 osize += 2; break;
11891 default:
11892 /* Fast-path ASCII */
11893 if (ch < ' ' || ch == 0x7f)
11894 osize += 4; /* \xHH */
11895 else if (ch < 0x7f)
11896 osize++;
11897 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11898 osize++;
11899 max = ch > max ? ch : max;
11900 }
11901 else if (ch < 0x100)
11902 osize += 4; /* \xHH */
11903 else if (ch < 0x10000)
11904 osize += 6; /* \uHHHH */
11905 else
11906 osize += 10; /* \uHHHHHHHH */
11907 }
11908 }
11909
11910 quote = '\'';
11911 if (squote) {
11912 if (dquote)
11913 /* Both squote and dquote present. Use squote,
11914 and escape them */
11915 osize += squote;
11916 else
11917 quote = '"';
11918 }
11919
11920 repr = PyUnicode_New(osize, max);
11921 if (repr == NULL)
11922 return NULL;
11923 okind = PyUnicode_KIND(repr);
11924 odata = PyUnicode_DATA(repr);
11925
11926 PyUnicode_WRITE(okind, odata, 0, quote);
11927 PyUnicode_WRITE(okind, odata, osize-1, quote);
11928
11929 for (i = 0, o = 1; i < isize; i++) {
11930 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011931
11932 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 if ((ch == quote) || (ch == '\\')) {
11934 PyUnicode_WRITE(okind, odata, o++, '\\');
11935 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011936 continue;
11937 }
11938
Benjamin Peterson29060642009-01-31 22:14:21 +000011939 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011940 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011941 PyUnicode_WRITE(okind, odata, o++, '\\');
11942 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011943 }
11944 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011945 PyUnicode_WRITE(okind, odata, o++, '\\');
11946 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011947 }
11948 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011949 PyUnicode_WRITE(okind, odata, o++, '\\');
11950 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011951 }
11952
11953 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011954 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011955 PyUnicode_WRITE(okind, odata, o++, '\\');
11956 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011957 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11958 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011959 }
11960
Georg Brandl559e5d72008-06-11 18:37:52 +000011961 /* Copy ASCII characters as-is */
11962 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011964 }
11965
Benjamin Peterson29060642009-01-31 22:14:21 +000011966 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011967 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011968 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011969 (categories Z* and C* except ASCII space)
11970 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011972 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011973 if (ch <= 0xff) {
11974 PyUnicode_WRITE(okind, odata, o++, '\\');
11975 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011976 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11977 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011978 }
11979 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 else if (ch >= 0x10000) {
11981 PyUnicode_WRITE(okind, odata, o++, '\\');
11982 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011983 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11984 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11985 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11986 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
11987 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11988 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11989 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11990 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011991 }
11992 /* Map 16-bit characters to '\uxxxx' */
11993 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994 PyUnicode_WRITE(okind, odata, o++, '\\');
11995 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011996 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11997 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11998 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11999 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012000 }
12001 }
12002 /* Copy characters as-is */
12003 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012005 }
12006 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012007 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012009 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012010 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011}
12012
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012013PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012014 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012015\n\
12016Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012017such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018arguments start and end are interpreted as in slice notation.\n\
12019\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012020Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012021
12022static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012025 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012026 Py_ssize_t start;
12027 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012028 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029
Jesus Ceaac451502011-04-20 17:09:23 +020012030 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12031 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012032 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034 if (PyUnicode_READY(self) == -1)
12035 return NULL;
12036 if (PyUnicode_READY(substring) == -1)
12037 return NULL;
12038
Victor Stinner7931d9a2011-11-04 00:22:48 +010012039 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012040
12041 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012043 if (result == -2)
12044 return NULL;
12045
Christian Heimes217cfd12007-12-02 14:31:20 +000012046 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047}
12048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012049PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012050 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012052Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053
12054static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012055unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012056{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012057 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012058 Py_ssize_t start;
12059 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012060 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061
Jesus Ceaac451502011-04-20 17:09:23 +020012062 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12063 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012064 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066 if (PyUnicode_READY(self) == -1)
12067 return NULL;
12068 if (PyUnicode_READY(substring) == -1)
12069 return NULL;
12070
Victor Stinner7931d9a2011-11-04 00:22:48 +010012071 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012072
12073 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012074
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012075 if (result == -2)
12076 return NULL;
12077
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078 if (result < 0) {
12079 PyErr_SetString(PyExc_ValueError, "substring not found");
12080 return NULL;
12081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012082
Christian Heimes217cfd12007-12-02 14:31:20 +000012083 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084}
12085
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012086PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012087 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012089Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012090done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091
12092static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012093unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012095 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012096 Py_UCS4 fillchar = ' ';
12097
Victor Stinnere9a29352011-10-01 02:14:59 +020012098 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012100
Victor Stinnere9a29352011-10-01 02:14:59 +020012101 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102 return NULL;
12103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012104 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012106 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107 }
12108
Victor Stinner7931d9a2011-11-04 00:22:48 +010012109 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110}
12111
Alexander Belopolsky40018472011-02-26 01:02:56 +000012112PyObject *
12113PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012114{
12115 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012116
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117 s = PyUnicode_FromObject(s);
12118 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012119 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012120 if (sep != NULL) {
12121 sep = PyUnicode_FromObject(sep);
12122 if (sep == NULL) {
12123 Py_DECREF(s);
12124 return NULL;
12125 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012126 }
12127
Victor Stinner9310abb2011-10-05 00:59:23 +020012128 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129
12130 Py_DECREF(s);
12131 Py_XDECREF(sep);
12132 return result;
12133}
12134
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012135PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012136 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137\n\
12138Return a list of the words in S, using sep as the\n\
12139delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012140splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012141whitespace string is a separator and empty strings are\n\
12142removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143
12144static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012145unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146{
12147 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012148 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149
Martin v. Löwis18e16552006-02-15 17:27:45 +000012150 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151 return NULL;
12152
12153 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012154 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012156 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012158 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159}
12160
Thomas Wouters477c8d52006-05-27 19:21:47 +000012161PyObject *
12162PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12163{
12164 PyObject* str_obj;
12165 PyObject* sep_obj;
12166 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012167 int kind1, kind2, kind;
12168 void *buf1 = NULL, *buf2 = NULL;
12169 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012170
12171 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012172 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012173 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012174 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012175 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012176 Py_DECREF(str_obj);
12177 return NULL;
12178 }
12179
Victor Stinner14f8f022011-10-05 20:58:25 +020012180 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012182 kind = Py_MAX(kind1, kind2);
12183 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012184 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012185 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012186 if (!buf1)
12187 goto onError;
12188 buf2 = PyUnicode_DATA(sep_obj);
12189 if (kind2 != kind)
12190 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12191 if (!buf2)
12192 goto onError;
12193 len1 = PyUnicode_GET_LENGTH(str_obj);
12194 len2 = PyUnicode_GET_LENGTH(sep_obj);
12195
Victor Stinner14f8f022011-10-05 20:58:25 +020012196 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012198 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12199 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12200 else
12201 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 break;
12203 case PyUnicode_2BYTE_KIND:
12204 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12205 break;
12206 case PyUnicode_4BYTE_KIND:
12207 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12208 break;
12209 default:
12210 assert(0);
12211 out = 0;
12212 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012213
12214 Py_DECREF(sep_obj);
12215 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012216 if (kind1 != kind)
12217 PyMem_Free(buf1);
12218 if (kind2 != kind)
12219 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012220
12221 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012222 onError:
12223 Py_DECREF(sep_obj);
12224 Py_DECREF(str_obj);
12225 if (kind1 != kind && buf1)
12226 PyMem_Free(buf1);
12227 if (kind2 != kind && buf2)
12228 PyMem_Free(buf2);
12229 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012230}
12231
12232
12233PyObject *
12234PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12235{
12236 PyObject* str_obj;
12237 PyObject* sep_obj;
12238 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012239 int kind1, kind2, kind;
12240 void *buf1 = NULL, *buf2 = NULL;
12241 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012242
12243 str_obj = PyUnicode_FromObject(str_in);
12244 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012245 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012246 sep_obj = PyUnicode_FromObject(sep_in);
12247 if (!sep_obj) {
12248 Py_DECREF(str_obj);
12249 return NULL;
12250 }
12251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012252 kind1 = PyUnicode_KIND(str_in);
12253 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012254 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012255 buf1 = PyUnicode_DATA(str_in);
12256 if (kind1 != kind)
12257 buf1 = _PyUnicode_AsKind(str_in, kind);
12258 if (!buf1)
12259 goto onError;
12260 buf2 = PyUnicode_DATA(sep_obj);
12261 if (kind2 != kind)
12262 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12263 if (!buf2)
12264 goto onError;
12265 len1 = PyUnicode_GET_LENGTH(str_obj);
12266 len2 = PyUnicode_GET_LENGTH(sep_obj);
12267
12268 switch(PyUnicode_KIND(str_in)) {
12269 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012270 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12271 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12272 else
12273 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 break;
12275 case PyUnicode_2BYTE_KIND:
12276 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12277 break;
12278 case PyUnicode_4BYTE_KIND:
12279 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12280 break;
12281 default:
12282 assert(0);
12283 out = 0;
12284 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012285
12286 Py_DECREF(sep_obj);
12287 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012288 if (kind1 != kind)
12289 PyMem_Free(buf1);
12290 if (kind2 != kind)
12291 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012292
12293 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012294 onError:
12295 Py_DECREF(sep_obj);
12296 Py_DECREF(str_obj);
12297 if (kind1 != kind && buf1)
12298 PyMem_Free(buf1);
12299 if (kind2 != kind && buf2)
12300 PyMem_Free(buf2);
12301 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012302}
12303
12304PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012305 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012306\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012307Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012308the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012309found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012310
12311static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012312unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012313{
Victor Stinner9310abb2011-10-05 00:59:23 +020012314 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012315}
12316
12317PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012318 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012319\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012320Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012321the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012322separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012323
12324static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012325unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012326{
Victor Stinner9310abb2011-10-05 00:59:23 +020012327 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012328}
12329
Alexander Belopolsky40018472011-02-26 01:02:56 +000012330PyObject *
12331PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012332{
12333 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012334
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012335 s = PyUnicode_FromObject(s);
12336 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012337 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012338 if (sep != NULL) {
12339 sep = PyUnicode_FromObject(sep);
12340 if (sep == NULL) {
12341 Py_DECREF(s);
12342 return NULL;
12343 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012344 }
12345
Victor Stinner9310abb2011-10-05 00:59:23 +020012346 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012347
12348 Py_DECREF(s);
12349 Py_XDECREF(sep);
12350 return result;
12351}
12352
12353PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012354 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012355\n\
12356Return a list of the words in S, using sep as the\n\
12357delimiter string, starting at the end of the string and\n\
12358working to the front. If maxsplit is given, at most maxsplit\n\
12359splits are done. If sep is not specified, any whitespace string\n\
12360is a separator.");
12361
12362static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012363unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012364{
12365 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012366 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012367
Martin v. Löwis18e16552006-02-15 17:27:45 +000012368 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012369 return NULL;
12370
12371 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012372 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012373 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012374 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012375 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012376 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012377}
12378
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012379PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012380 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381\n\
12382Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012383Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012384is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012385
12386static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012387unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012388{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012389 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012390 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012391
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012392 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12393 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012394 return NULL;
12395
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012396 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397}
12398
12399static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012400PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012401{
Walter Dörwald346737f2007-05-31 10:44:43 +000012402 if (PyUnicode_CheckExact(self)) {
12403 Py_INCREF(self);
12404 return self;
12405 } else
12406 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012407 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012408}
12409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012410PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012411 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012412\n\
12413Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012414and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012415
12416static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012417unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012418{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012419 return fixup(self, fixswapcase);
12420}
12421
Georg Brandlceee0772007-11-27 23:48:05 +000012422PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012423 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012424\n\
12425Return a translation table usable for str.translate().\n\
12426If there is only one argument, it must be a dictionary mapping Unicode\n\
12427ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012428Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012429If there are two arguments, they must be strings of equal length, and\n\
12430in the resulting dictionary, each character in x will be mapped to the\n\
12431character at the same position in y. If there is a third argument, it\n\
12432must be a string, whose characters will be mapped to None in the result.");
12433
12434static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012435unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012436{
12437 PyObject *x, *y = NULL, *z = NULL;
12438 PyObject *new = NULL, *key, *value;
12439 Py_ssize_t i = 0;
12440 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012441
Georg Brandlceee0772007-11-27 23:48:05 +000012442 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12443 return NULL;
12444 new = PyDict_New();
12445 if (!new)
12446 return NULL;
12447 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012448 int x_kind, y_kind, z_kind;
12449 void *x_data, *y_data, *z_data;
12450
Georg Brandlceee0772007-11-27 23:48:05 +000012451 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012452 if (!PyUnicode_Check(x)) {
12453 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12454 "be a string if there is a second argument");
12455 goto err;
12456 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012457 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012458 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12459 "arguments must have equal length");
12460 goto err;
12461 }
12462 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012463 x_kind = PyUnicode_KIND(x);
12464 y_kind = PyUnicode_KIND(y);
12465 x_data = PyUnicode_DATA(x);
12466 y_data = PyUnicode_DATA(y);
12467 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12468 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12469 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012470 if (!key || !value)
12471 goto err;
12472 res = PyDict_SetItem(new, key, value);
12473 Py_DECREF(key);
12474 Py_DECREF(value);
12475 if (res < 0)
12476 goto err;
12477 }
12478 /* create entries for deleting chars in z */
12479 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480 z_kind = PyUnicode_KIND(z);
12481 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012482 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012483 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012484 if (!key)
12485 goto err;
12486 res = PyDict_SetItem(new, key, Py_None);
12487 Py_DECREF(key);
12488 if (res < 0)
12489 goto err;
12490 }
12491 }
12492 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012493 int kind;
12494 void *data;
12495
Georg Brandlceee0772007-11-27 23:48:05 +000012496 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012497 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012498 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12499 "to maketrans it must be a dict");
12500 goto err;
12501 }
12502 /* copy entries into the new dict, converting string keys to int keys */
12503 while (PyDict_Next(x, &i, &key, &value)) {
12504 if (PyUnicode_Check(key)) {
12505 /* convert string keys to integer keys */
12506 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012507 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012508 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12509 "table must be of length 1");
12510 goto err;
12511 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012512 kind = PyUnicode_KIND(key);
12513 data = PyUnicode_DATA(key);
12514 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012515 if (!newkey)
12516 goto err;
12517 res = PyDict_SetItem(new, newkey, value);
12518 Py_DECREF(newkey);
12519 if (res < 0)
12520 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012521 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012522 /* just keep integer keys */
12523 if (PyDict_SetItem(new, key, value) < 0)
12524 goto err;
12525 } else {
12526 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12527 "be strings or integers");
12528 goto err;
12529 }
12530 }
12531 }
12532 return new;
12533 err:
12534 Py_DECREF(new);
12535 return NULL;
12536}
12537
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012538PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012539 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540\n\
12541Return a copy of the string S, where all characters have been mapped\n\
12542through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012543Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012544Unmapped characters are left untouched. Characters mapped to None\n\
12545are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546
12547static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012548unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551}
12552
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012553PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012554 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012556Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557
12558static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012559unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012560{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012561 return fixup(self, fixupper);
12562}
12563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012564PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012565 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012567Pad a numeric string S with zeros on the left, to fill a field\n\
12568of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012569
12570static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012571unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012573 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012574 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012575 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012576 int kind;
12577 void *data;
12578 Py_UCS4 chr;
12579
12580 if (PyUnicode_READY(self) == -1)
12581 return NULL;
12582
Martin v. Löwis18e16552006-02-15 17:27:45 +000012583 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584 return NULL;
12585
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012587 if (PyUnicode_CheckExact(self)) {
12588 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012589 return self;
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012590 }
12591 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012592 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593 }
12594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012595 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596
12597 u = pad(self, fill, 0, '0');
12598
Walter Dörwald068325e2002-04-15 13:36:47 +000012599 if (u == NULL)
12600 return NULL;
12601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012602 kind = PyUnicode_KIND(u);
12603 data = PyUnicode_DATA(u);
12604 chr = PyUnicode_READ(kind, data, fill);
12605
12606 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012607 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608 PyUnicode_WRITE(kind, data, 0, chr);
12609 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610 }
12611
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012612 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012613 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012615
12616#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012617static PyObject *
12618unicode__decimal2ascii(PyObject *self)
12619{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012620 return PyUnicode_TransformDecimalAndSpaceToASCII(self);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012621}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012622#endif
12623
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012624PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012625 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012627Return True if S starts with the specified prefix, False otherwise.\n\
12628With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012629With optional end, stop comparing S at that position.\n\
12630prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631
12632static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012633unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012634 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012636 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012637 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012638 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012639 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012640 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641
Jesus Ceaac451502011-04-20 17:09:23 +020012642 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012643 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012644 if (PyTuple_Check(subobj)) {
12645 Py_ssize_t i;
12646 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012647 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012648 if (substring == NULL)
12649 return NULL;
12650 result = tailmatch(self, substring, start, end, -1);
12651 Py_DECREF(substring);
12652 if (result) {
12653 Py_RETURN_TRUE;
12654 }
12655 }
12656 /* nothing matched */
12657 Py_RETURN_FALSE;
12658 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012659 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012660 if (substring == NULL) {
12661 if (PyErr_ExceptionMatches(PyExc_TypeError))
12662 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12663 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012664 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012665 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012666 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012668 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012669}
12670
12671
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012672PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012673 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012674\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012675Return True if S ends with the specified suffix, False otherwise.\n\
12676With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012677With optional end, stop comparing S at that position.\n\
12678suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679
12680static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012681unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012682 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012683{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012684 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012685 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012686 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012687 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012688 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689
Jesus Ceaac451502011-04-20 17:09:23 +020012690 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012691 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012692 if (PyTuple_Check(subobj)) {
12693 Py_ssize_t i;
12694 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012695 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012696 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012697 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012698 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012699 result = tailmatch(self, substring, start, end, +1);
12700 Py_DECREF(substring);
12701 if (result) {
12702 Py_RETURN_TRUE;
12703 }
12704 }
12705 Py_RETURN_FALSE;
12706 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012707 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012708 if (substring == NULL) {
12709 if (PyErr_ExceptionMatches(PyExc_TypeError))
12710 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12711 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012712 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012713 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012714 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012715 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012716 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012717}
12718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012719#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012720
12721PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012722 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012723\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012724Return a formatted version of S, using substitutions from args and kwargs.\n\
12725The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012726
Eric Smith27bbca62010-11-04 17:06:58 +000012727PyDoc_STRVAR(format_map__doc__,
12728 "S.format_map(mapping) -> str\n\
12729\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012730Return a formatted version of S, using substitutions from mapping.\n\
12731The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012732
Eric Smith4a7d76d2008-05-30 18:10:19 +000012733static PyObject *
12734unicode__format__(PyObject* self, PyObject* args)
12735{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012736 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012737
12738 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12739 return NULL;
12740
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012741 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012742 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012743 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012744}
12745
Eric Smith8c663262007-08-25 02:26:07 +000012746PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012747 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012748\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012749Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012750
12751static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012752unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012753{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012754 Py_ssize_t size;
12755
12756 /* If it's a compact object, account for base structure +
12757 character data. */
12758 if (PyUnicode_IS_COMPACT_ASCII(v))
12759 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12760 else if (PyUnicode_IS_COMPACT(v))
12761 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012762 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012763 else {
12764 /* If it is a two-block object, account for base object, and
12765 for character block if present. */
12766 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012767 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012768 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012769 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012770 }
12771 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012772 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012773 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012774 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012775 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012776 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012777
12778 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012779}
12780
12781PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012782 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012783
12784static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012785unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012786{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012787 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012788 if (!copy)
12789 return NULL;
12790 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012791}
12792
Guido van Rossumd57fd912000-03-10 22:53:23 +000012793static PyMethodDef unicode_methods[] = {
12794
12795 /* Order is according to common usage: often used methods should
12796 appear first, since lookup is done sequentially. */
12797
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012798 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012799 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12800 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012801 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012802 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12803 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12804 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12805 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12806 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12807 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12808 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012809 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012810 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12811 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12812 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012813 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012814 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12815 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12816 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012817 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012818 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012819 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012820 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012821 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12822 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12823 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12824 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12825 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12826 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12827 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12828 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12829 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12830 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12831 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12832 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12833 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12834 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012835 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012836 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012837 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012838 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012839 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012840 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012841 {"maketrans", (PyCFunction) unicode_maketrans,
12842 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012843 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012844#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012845 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012846#endif
12847
12848#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012849 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012850 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012851#endif
12852
Benjamin Peterson14339b62009-01-31 16:36:08 +000012853 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012854 {NULL, NULL}
12855};
12856
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012857static PyObject *
12858unicode_mod(PyObject *v, PyObject *w)
12859{
Brian Curtindfc80e32011-08-10 20:28:54 -050012860 if (!PyUnicode_Check(v))
12861 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012862 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012863}
12864
12865static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012866 0, /*nb_add*/
12867 0, /*nb_subtract*/
12868 0, /*nb_multiply*/
12869 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012870};
12871
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012873 (lenfunc) unicode_length, /* sq_length */
12874 PyUnicode_Concat, /* sq_concat */
12875 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12876 (ssizeargfunc) unicode_getitem, /* sq_item */
12877 0, /* sq_slice */
12878 0, /* sq_ass_item */
12879 0, /* sq_ass_slice */
12880 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012881};
12882
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012883static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012884unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012885{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012886 if (PyUnicode_READY(self) == -1)
12887 return NULL;
12888
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012889 if (PyIndex_Check(item)) {
12890 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012891 if (i == -1 && PyErr_Occurred())
12892 return NULL;
12893 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012894 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012895 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012896 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012897 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012898 PyObject *result;
12899 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012900 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012901 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012903 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012904 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012905 return NULL;
12906 }
12907
12908 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012909 return PyUnicode_New(0, 0);
12910 } else if (start == 0 && step == 1 &&
12911 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012912 PyUnicode_CheckExact(self)) {
12913 Py_INCREF(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012914 return self;
Thomas Woutersed03b412007-08-28 21:37:11 +000012915 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012916 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020012917 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012918 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012919 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012920 src_kind = PyUnicode_KIND(self);
12921 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020012922 if (!PyUnicode_IS_ASCII(self)) {
12923 kind_limit = kind_maxchar_limit(src_kind);
12924 max_char = 0;
12925 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12926 ch = PyUnicode_READ(src_kind, src_data, cur);
12927 if (ch > max_char) {
12928 max_char = ch;
12929 if (max_char >= kind_limit)
12930 break;
12931 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012932 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012933 }
Victor Stinner55c99112011-10-13 01:17:06 +020012934 else
12935 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012936 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012937 if (result == NULL)
12938 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012939 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012940 dest_data = PyUnicode_DATA(result);
12941
12942 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012943 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12944 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012945 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012946 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012947 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012948 } else {
12949 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12950 return NULL;
12951 }
12952}
12953
12954static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012955 (lenfunc)unicode_length, /* mp_length */
12956 (binaryfunc)unicode_subscript, /* mp_subscript */
12957 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012958};
12959
Guido van Rossumd57fd912000-03-10 22:53:23 +000012960
Guido van Rossumd57fd912000-03-10 22:53:23 +000012961/* Helpers for PyUnicode_Format() */
12962
12963static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012964getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012965{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012966 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012967 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012968 (*p_argidx)++;
12969 if (arglen < 0)
12970 return args;
12971 else
12972 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012973 }
12974 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012975 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012976 return NULL;
12977}
12978
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012979/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012980
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012981static PyObject *
12982formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012983{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012984 char *p;
12985 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012986 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012987
Guido van Rossumd57fd912000-03-10 22:53:23 +000012988 x = PyFloat_AsDouble(v);
12989 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012990 return NULL;
12991
Guido van Rossumd57fd912000-03-10 22:53:23 +000012992 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012993 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012994
Eric Smith0923d1d2009-04-16 20:16:10 +000012995 p = PyOS_double_to_string(x, type, prec,
12996 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012997 if (p == NULL)
12998 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012999 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013000 PyMem_Free(p);
13001 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013002}
13003
Tim Peters38fd5b62000-09-21 05:43:11 +000013004static PyObject*
13005formatlong(PyObject *val, int flags, int prec, int type)
13006{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013007 char *buf;
13008 int len;
13009 PyObject *str; /* temporary string object. */
13010 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013011
Benjamin Peterson14339b62009-01-31 16:36:08 +000013012 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13013 if (!str)
13014 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013015 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013016 Py_DECREF(str);
13017 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013018}
13019
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013020static Py_UCS4
13021formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013022{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013023 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013024 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013025 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013026 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013027 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013028 goto onError;
13029 }
13030 else {
13031 /* Integer input truncated to a character */
13032 long x;
13033 x = PyLong_AsLong(v);
13034 if (x == -1 && PyErr_Occurred())
13035 goto onError;
13036
Victor Stinner8faf8212011-12-08 22:14:11 +010013037 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013038 PyErr_SetString(PyExc_OverflowError,
13039 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013040 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013041 }
13042
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013043 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013044 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013045
Benjamin Peterson29060642009-01-31 22:14:21 +000013046 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013047 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013048 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013049 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013050}
13051
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013052static int
13053repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13054{
13055 int r;
13056 assert(count > 0);
13057 assert(PyUnicode_Check(obj));
13058 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013059 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013060 if (repeated == NULL)
13061 return -1;
13062 r = _PyAccu_Accumulate(acc, repeated);
13063 Py_DECREF(repeated);
13064 return r;
13065 }
13066 else {
13067 do {
13068 if (_PyAccu_Accumulate(acc, obj))
13069 return -1;
13070 } while (--count);
13071 return 0;
13072 }
13073}
13074
Alexander Belopolsky40018472011-02-26 01:02:56 +000013075PyObject *
13076PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013077{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013078 void *fmt;
13079 int fmtkind;
13080 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013081 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013082 int r;
13083 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013084 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013085 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013086 PyObject *temp = NULL;
13087 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013088 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013089 _PyAccu acc;
13090 static PyObject *plus, *minus, *blank, *zero, *percent;
13091
13092 if (!plus && !(plus = get_latin1_char('+')))
13093 return NULL;
13094 if (!minus && !(minus = get_latin1_char('-')))
13095 return NULL;
13096 if (!blank && !(blank = get_latin1_char(' ')))
13097 return NULL;
13098 if (!zero && !(zero = get_latin1_char('0')))
13099 return NULL;
13100 if (!percent && !(percent = get_latin1_char('%')))
13101 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013102
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013104 PyErr_BadInternalCall();
13105 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013106 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013107 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013108 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013109 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013110 if (_PyAccu_Init(&acc))
13111 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013112 fmt = PyUnicode_DATA(uformat);
13113 fmtkind = PyUnicode_KIND(uformat);
13114 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13115 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013116
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013118 arglen = PyTuple_Size(args);
13119 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120 }
13121 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013122 arglen = -1;
13123 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013125 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013126 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013127 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128
13129 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013130 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013131 PyObject *nonfmt;
13132 Py_ssize_t nonfmtpos;
13133 nonfmtpos = fmtpos++;
13134 while (fmtcnt >= 0 &&
13135 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13136 fmtpos++;
13137 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013138 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013139 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013140 if (nonfmt == NULL)
13141 goto onError;
13142 r = _PyAccu_Accumulate(&acc, nonfmt);
13143 Py_DECREF(nonfmt);
13144 if (r)
13145 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013146 }
13147 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013148 /* Got a format specifier */
13149 int flags = 0;
13150 Py_ssize_t width = -1;
13151 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013152 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013153 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013154 int isnumok;
13155 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013156 void *pbuf = NULL;
13157 Py_ssize_t pindex, len;
13158 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013160 fmtpos++;
13161 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13162 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013163 Py_ssize_t keylen;
13164 PyObject *key;
13165 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013166
Benjamin Peterson29060642009-01-31 22:14:21 +000013167 if (dict == NULL) {
13168 PyErr_SetString(PyExc_TypeError,
13169 "format requires a mapping");
13170 goto onError;
13171 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013172 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013173 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013174 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013175 /* Skip over balanced parentheses */
13176 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013177 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013178 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013179 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013180 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013181 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013182 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013183 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013184 if (fmtcnt < 0 || pcount > 0) {
13185 PyErr_SetString(PyExc_ValueError,
13186 "incomplete format key");
13187 goto onError;
13188 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013189 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013190 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013191 if (key == NULL)
13192 goto onError;
13193 if (args_owned) {
13194 Py_DECREF(args);
13195 args_owned = 0;
13196 }
13197 args = PyObject_GetItem(dict, key);
13198 Py_DECREF(key);
13199 if (args == NULL) {
13200 goto onError;
13201 }
13202 args_owned = 1;
13203 arglen = -1;
13204 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013205 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013206 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013207 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013208 case '-': flags |= F_LJUST; continue;
13209 case '+': flags |= F_SIGN; continue;
13210 case ' ': flags |= F_BLANK; continue;
13211 case '#': flags |= F_ALT; continue;
13212 case '0': flags |= F_ZERO; continue;
13213 }
13214 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013215 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013216 if (c == '*') {
13217 v = getnextarg(args, arglen, &argidx);
13218 if (v == NULL)
13219 goto onError;
13220 if (!PyLong_Check(v)) {
13221 PyErr_SetString(PyExc_TypeError,
13222 "* wants int");
13223 goto onError;
13224 }
13225 width = PyLong_AsLong(v);
13226 if (width == -1 && PyErr_Occurred())
13227 goto onError;
13228 if (width < 0) {
13229 flags |= F_LJUST;
13230 width = -width;
13231 }
13232 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013233 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013234 }
13235 else if (c >= '0' && c <= '9') {
13236 width = c - '0';
13237 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013238 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013239 if (c < '0' || c > '9')
13240 break;
13241 if ((width*10) / 10 != width) {
13242 PyErr_SetString(PyExc_ValueError,
13243 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013244 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013245 }
13246 width = width*10 + (c - '0');
13247 }
13248 }
13249 if (c == '.') {
13250 prec = 0;
13251 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013252 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013253 if (c == '*') {
13254 v = getnextarg(args, arglen, &argidx);
13255 if (v == NULL)
13256 goto onError;
13257 if (!PyLong_Check(v)) {
13258 PyErr_SetString(PyExc_TypeError,
13259 "* wants int");
13260 goto onError;
13261 }
13262 prec = PyLong_AsLong(v);
13263 if (prec == -1 && PyErr_Occurred())
13264 goto onError;
13265 if (prec < 0)
13266 prec = 0;
13267 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013268 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013269 }
13270 else if (c >= '0' && c <= '9') {
13271 prec = c - '0';
13272 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013273 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013274 if (c < '0' || c > '9')
13275 break;
13276 if ((prec*10) / 10 != prec) {
13277 PyErr_SetString(PyExc_ValueError,
13278 "prec too big");
13279 goto onError;
13280 }
13281 prec = prec*10 + (c - '0');
13282 }
13283 }
13284 } /* prec */
13285 if (fmtcnt >= 0) {
13286 if (c == 'h' || c == 'l' || c == 'L') {
13287 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013288 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013289 }
13290 }
13291 if (fmtcnt < 0) {
13292 PyErr_SetString(PyExc_ValueError,
13293 "incomplete format");
13294 goto onError;
13295 }
13296 if (c != '%') {
13297 v = getnextarg(args, arglen, &argidx);
13298 if (v == NULL)
13299 goto onError;
13300 }
13301 sign = 0;
13302 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013303 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013304 switch (c) {
13305
13306 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013307 _PyAccu_Accumulate(&acc, percent);
13308 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013309
13310 case 's':
13311 case 'r':
13312 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013313 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013314 temp = v;
13315 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013316 }
13317 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013318 if (c == 's')
13319 temp = PyObject_Str(v);
13320 else if (c == 'r')
13321 temp = PyObject_Repr(v);
13322 else
13323 temp = PyObject_ASCII(v);
13324 if (temp == NULL)
13325 goto onError;
13326 if (PyUnicode_Check(temp))
13327 /* nothing to do */;
13328 else {
13329 Py_DECREF(temp);
13330 PyErr_SetString(PyExc_TypeError,
13331 "%s argument has non-string str()");
13332 goto onError;
13333 }
13334 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013335 if (PyUnicode_READY(temp) == -1) {
13336 Py_CLEAR(temp);
13337 goto onError;
13338 }
13339 pbuf = PyUnicode_DATA(temp);
13340 kind = PyUnicode_KIND(temp);
13341 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013342 if (prec >= 0 && len > prec)
13343 len = prec;
13344 break;
13345
13346 case 'i':
13347 case 'd':
13348 case 'u':
13349 case 'o':
13350 case 'x':
13351 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013352 isnumok = 0;
13353 if (PyNumber_Check(v)) {
13354 PyObject *iobj=NULL;
13355
13356 if (PyLong_Check(v)) {
13357 iobj = v;
13358 Py_INCREF(iobj);
13359 }
13360 else {
13361 iobj = PyNumber_Long(v);
13362 }
13363 if (iobj!=NULL) {
13364 if (PyLong_Check(iobj)) {
13365 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013366 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013367 Py_DECREF(iobj);
13368 if (!temp)
13369 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013370 if (PyUnicode_READY(temp) == -1) {
13371 Py_CLEAR(temp);
13372 goto onError;
13373 }
13374 pbuf = PyUnicode_DATA(temp);
13375 kind = PyUnicode_KIND(temp);
13376 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013377 sign = 1;
13378 }
13379 else {
13380 Py_DECREF(iobj);
13381 }
13382 }
13383 }
13384 if (!isnumok) {
13385 PyErr_Format(PyExc_TypeError,
13386 "%%%c format: a number is required, "
13387 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13388 goto onError;
13389 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013390 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013391 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013392 fillobj = zero;
13393 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013394 break;
13395
13396 case 'e':
13397 case 'E':
13398 case 'f':
13399 case 'F':
13400 case 'g':
13401 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013402 temp = formatfloat(v, flags, prec, c);
13403 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013404 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013405 if (PyUnicode_READY(temp) == -1) {
13406 Py_CLEAR(temp);
13407 goto onError;
13408 }
13409 pbuf = PyUnicode_DATA(temp);
13410 kind = PyUnicode_KIND(temp);
13411 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013412 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013413 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013414 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013415 fillobj = zero;
13416 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013417 break;
13418
13419 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013420 {
13421 Py_UCS4 ch = formatchar(v);
13422 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013423 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013424 temp = _PyUnicode_FromUCS4(&ch, 1);
13425 if (temp == NULL)
13426 goto onError;
13427 pbuf = PyUnicode_DATA(temp);
13428 kind = PyUnicode_KIND(temp);
13429 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013430 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013431 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013432
13433 default:
13434 PyErr_Format(PyExc_ValueError,
13435 "unsupported format character '%c' (0x%x) "
13436 "at index %zd",
13437 (31<=c && c<=126) ? (char)c : '?',
13438 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013439 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013440 goto onError;
13441 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013442 /* pbuf is initialized here. */
13443 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013444 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013445 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13446 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013447 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013448 pindex++;
13449 }
13450 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13451 signobj = plus;
13452 len--;
13453 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013454 }
13455 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013456 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013457 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013458 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013459 else
13460 sign = 0;
13461 }
13462 if (width < len)
13463 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013464 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013465 if (fill != ' ') {
13466 assert(signobj != NULL);
13467 if (_PyAccu_Accumulate(&acc, signobj))
13468 goto onError;
13469 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013470 if (width > len)
13471 width--;
13472 }
13473 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013474 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013475 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013476 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013477 second = get_latin1_char(
13478 PyUnicode_READ(kind, pbuf, pindex + 1));
13479 pindex += 2;
13480 if (second == NULL ||
13481 _PyAccu_Accumulate(&acc, zero) ||
13482 _PyAccu_Accumulate(&acc, second))
13483 goto onError;
13484 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013485 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013486 width -= 2;
13487 if (width < 0)
13488 width = 0;
13489 len -= 2;
13490 }
13491 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013492 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013493 if (repeat_accumulate(&acc, fillobj, width - len))
13494 goto onError;
13495 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013496 }
13497 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013498 if (sign) {
13499 assert(signobj != NULL);
13500 if (_PyAccu_Accumulate(&acc, signobj))
13501 goto onError;
13502 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013503 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013504 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13505 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013506 second = get_latin1_char(
13507 PyUnicode_READ(kind, pbuf, pindex + 1));
13508 pindex += 2;
13509 if (second == NULL ||
13510 _PyAccu_Accumulate(&acc, zero) ||
13511 _PyAccu_Accumulate(&acc, second))
13512 goto onError;
13513 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013514 }
13515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013516 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013517 if (temp != NULL) {
13518 assert(pbuf == PyUnicode_DATA(temp));
13519 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013520 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013521 else {
13522 const char *p = (const char *) pbuf;
13523 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013524 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013525 v = PyUnicode_FromKindAndData(kind, p, len);
13526 }
13527 if (v == NULL)
13528 goto onError;
13529 r = _PyAccu_Accumulate(&acc, v);
13530 Py_DECREF(v);
13531 if (r)
13532 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013533 if (width > len && repeat_accumulate(&acc, blank, width - len))
13534 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013535 if (dict && (argidx < arglen) && c != '%') {
13536 PyErr_SetString(PyExc_TypeError,
13537 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013538 goto onError;
13539 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013540 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013541 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013542 } /* until end */
13543 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013544 PyErr_SetString(PyExc_TypeError,
13545 "not all arguments converted during string formatting");
13546 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013547 }
13548
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013549 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013550 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013551 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013552 }
13553 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013554 Py_XDECREF(temp);
13555 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013556 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013557
Benjamin Peterson29060642009-01-31 22:14:21 +000013558 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013559 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013560 Py_XDECREF(temp);
13561 Py_XDECREF(second);
13562 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013563 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013564 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013565 }
13566 return NULL;
13567}
13568
Jeremy Hylton938ace62002-07-17 16:30:39 +000013569static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013570unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13571
Tim Peters6d6c1a32001-08-02 04:15:00 +000013572static PyObject *
13573unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13574{
Benjamin Peterson29060642009-01-31 22:14:21 +000013575 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013576 static char *kwlist[] = {"object", "encoding", "errors", 0};
13577 char *encoding = NULL;
13578 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013579
Benjamin Peterson14339b62009-01-31 16:36:08 +000013580 if (type != &PyUnicode_Type)
13581 return unicode_subtype_new(type, args, kwds);
13582 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013583 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013584 return NULL;
13585 if (x == NULL)
Victor Stinner7931d9a2011-11-04 00:22:48 +010013586 return PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013587 if (encoding == NULL && errors == NULL)
13588 return PyObject_Str(x);
13589 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013590 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013591}
13592
Guido van Rossume023fe02001-08-30 03:12:59 +000013593static PyObject *
13594unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13595{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013596 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013597 Py_ssize_t length, char_size;
13598 int share_wstr, share_utf8;
13599 unsigned int kind;
13600 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013601
Benjamin Peterson14339b62009-01-31 16:36:08 +000013602 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013603
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013604 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013605 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013606 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013607 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013608 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013609 return NULL;
13610
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013611 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013612 if (self == NULL) {
13613 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013614 return NULL;
13615 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013616 kind = PyUnicode_KIND(unicode);
13617 length = PyUnicode_GET_LENGTH(unicode);
13618
13619 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013620#ifdef Py_DEBUG
13621 _PyUnicode_HASH(self) = -1;
13622#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013623 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013624#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013625 _PyUnicode_STATE(self).interned = 0;
13626 _PyUnicode_STATE(self).kind = kind;
13627 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013628 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013629 _PyUnicode_STATE(self).ready = 1;
13630 _PyUnicode_WSTR(self) = NULL;
13631 _PyUnicode_UTF8_LENGTH(self) = 0;
13632 _PyUnicode_UTF8(self) = NULL;
13633 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013634 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013635
13636 share_utf8 = 0;
13637 share_wstr = 0;
13638 if (kind == PyUnicode_1BYTE_KIND) {
13639 char_size = 1;
13640 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13641 share_utf8 = 1;
13642 }
13643 else if (kind == PyUnicode_2BYTE_KIND) {
13644 char_size = 2;
13645 if (sizeof(wchar_t) == 2)
13646 share_wstr = 1;
13647 }
13648 else {
13649 assert(kind == PyUnicode_4BYTE_KIND);
13650 char_size = 4;
13651 if (sizeof(wchar_t) == 4)
13652 share_wstr = 1;
13653 }
13654
13655 /* Ensure we won't overflow the length. */
13656 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13657 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013658 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013659 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013660 data = PyObject_MALLOC((length + 1) * char_size);
13661 if (data == NULL) {
13662 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013663 goto onError;
13664 }
13665
Victor Stinnerc3c74152011-10-02 20:39:55 +020013666 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013667 if (share_utf8) {
13668 _PyUnicode_UTF8_LENGTH(self) = length;
13669 _PyUnicode_UTF8(self) = data;
13670 }
13671 if (share_wstr) {
13672 _PyUnicode_WSTR_LENGTH(self) = length;
13673 _PyUnicode_WSTR(self) = (wchar_t *)data;
13674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013675
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013676 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013677 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013678 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013679#ifdef Py_DEBUG
13680 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13681#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013682 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013683 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013684
13685onError:
13686 Py_DECREF(unicode);
13687 Py_DECREF(self);
13688 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013689}
13690
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013691PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013692 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013693\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013694Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013695encoding defaults to the current default string encoding.\n\
13696errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013697
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013698static PyObject *unicode_iter(PyObject *seq);
13699
Guido van Rossumd57fd912000-03-10 22:53:23 +000013700PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013701 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013702 "str", /* tp_name */
13703 sizeof(PyUnicodeObject), /* tp_size */
13704 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013705 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013706 (destructor)unicode_dealloc, /* tp_dealloc */
13707 0, /* tp_print */
13708 0, /* tp_getattr */
13709 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013710 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013711 unicode_repr, /* tp_repr */
13712 &unicode_as_number, /* tp_as_number */
13713 &unicode_as_sequence, /* tp_as_sequence */
13714 &unicode_as_mapping, /* tp_as_mapping */
13715 (hashfunc) unicode_hash, /* tp_hash*/
13716 0, /* tp_call*/
13717 (reprfunc) unicode_str, /* tp_str */
13718 PyObject_GenericGetAttr, /* tp_getattro */
13719 0, /* tp_setattro */
13720 0, /* tp_as_buffer */
13721 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013722 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013723 unicode_doc, /* tp_doc */
13724 0, /* tp_traverse */
13725 0, /* tp_clear */
13726 PyUnicode_RichCompare, /* tp_richcompare */
13727 0, /* tp_weaklistoffset */
13728 unicode_iter, /* tp_iter */
13729 0, /* tp_iternext */
13730 unicode_methods, /* tp_methods */
13731 0, /* tp_members */
13732 0, /* tp_getset */
13733 &PyBaseObject_Type, /* tp_base */
13734 0, /* tp_dict */
13735 0, /* tp_descr_get */
13736 0, /* tp_descr_set */
13737 0, /* tp_dictoffset */
13738 0, /* tp_init */
13739 0, /* tp_alloc */
13740 unicode_new, /* tp_new */
13741 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013742};
13743
13744/* Initialize the Unicode implementation */
13745
Victor Stinner3a50e702011-10-18 21:21:00 +020013746int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013747{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013748 int i;
13749
Thomas Wouters477c8d52006-05-27 19:21:47 +000013750 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013751 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013752 0x000A, /* LINE FEED */
13753 0x000D, /* CARRIAGE RETURN */
13754 0x001C, /* FILE SEPARATOR */
13755 0x001D, /* GROUP SEPARATOR */
13756 0x001E, /* RECORD SEPARATOR */
13757 0x0085, /* NEXT LINE */
13758 0x2028, /* LINE SEPARATOR */
13759 0x2029, /* PARAGRAPH SEPARATOR */
13760 };
13761
Fred Drakee4315f52000-05-09 19:53:39 +000013762 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013763 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013764 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013765 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010013766 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013767
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013768 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013769 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013770 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013771 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013772
13773 /* initialize the linebreak bloom filter */
13774 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013775 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013776 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013777
13778 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013779
13780#ifdef HAVE_MBCS
13781 winver.dwOSVersionInfoSize = sizeof(winver);
13782 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13783 PyErr_SetFromWindowsErr(0);
13784 return -1;
13785 }
13786#endif
13787 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013788}
13789
13790/* Finalize the Unicode implementation */
13791
/* Free-list clearing hook for the Unicode type.
   This implementation does no work and unconditionally reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13797
Guido van Rossumd57fd912000-03-10 22:53:23 +000013798void
Thomas Wouters78890102000-07-22 19:25:51 +000013799_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013800{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013801 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013802
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013803 Py_XDECREF(unicode_empty);
13804 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013805
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013806 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013807 if (unicode_latin1[i]) {
13808 Py_DECREF(unicode_latin1[i]);
13809 unicode_latin1[i] = NULL;
13810 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013811 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013812 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013813 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013814}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013815
Walter Dörwald16807132007-05-25 13:52:07 +000013816void
13817PyUnicode_InternInPlace(PyObject **p)
13818{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013819 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013820 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013821#ifdef Py_DEBUG
13822 assert(s != NULL);
13823 assert(_PyUnicode_CHECK(s));
13824#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013825 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013826 return;
13827#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013828 /* If it's a subclass, we don't really know what putting
13829 it in the interned dict might do. */
13830 if (!PyUnicode_CheckExact(s))
13831 return;
13832 if (PyUnicode_CHECK_INTERNED(s))
13833 return;
13834 if (interned == NULL) {
13835 interned = PyDict_New();
13836 if (interned == NULL) {
13837 PyErr_Clear(); /* Don't leave an exception */
13838 return;
13839 }
13840 }
13841 /* It might be that the GetItem call fails even
13842 though the key is present in the dictionary,
13843 namely when this happens during a stack overflow. */
13844 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013845 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013846 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013847
Benjamin Peterson29060642009-01-31 22:14:21 +000013848 if (t) {
13849 Py_INCREF(t);
13850 Py_DECREF(*p);
13851 *p = t;
13852 return;
13853 }
Walter Dörwald16807132007-05-25 13:52:07 +000013854
Benjamin Peterson14339b62009-01-31 16:36:08 +000013855 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013856 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013857 PyErr_Clear();
13858 PyThreadState_GET()->recursion_critical = 0;
13859 return;
13860 }
13861 PyThreadState_GET()->recursion_critical = 0;
13862 /* The two references in interned are not counted by refcnt.
13863 The deallocator will take care of this */
13864 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013865 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013866}
13867
13868void
13869PyUnicode_InternImmortal(PyObject **p)
13870{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013871 PyUnicode_InternInPlace(p);
13872 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020013873 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013874 Py_INCREF(*p);
13875 }
Walter Dörwald16807132007-05-25 13:52:07 +000013876}
13877
13878PyObject *
13879PyUnicode_InternFromString(const char *cp)
13880{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013881 PyObject *s = PyUnicode_FromString(cp);
13882 if (s == NULL)
13883 return NULL;
13884 PyUnicode_InternInPlace(&s);
13885 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013886}
13887
Alexander Belopolsky40018472011-02-26 01:02:56 +000013888void
13889_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013890{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013891 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013892 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013893 Py_ssize_t i, n;
13894 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013895
Benjamin Peterson14339b62009-01-31 16:36:08 +000013896 if (interned == NULL || !PyDict_Check(interned))
13897 return;
13898 keys = PyDict_Keys(interned);
13899 if (keys == NULL || !PyList_Check(keys)) {
13900 PyErr_Clear();
13901 return;
13902 }
Walter Dörwald16807132007-05-25 13:52:07 +000013903
Benjamin Peterson14339b62009-01-31 16:36:08 +000013904 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13905 detector, interned unicode strings are not forcibly deallocated;
13906 rather, we give them their stolen references back, and then clear
13907 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013908
Benjamin Peterson14339b62009-01-31 16:36:08 +000013909 n = PyList_GET_SIZE(keys);
13910 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013911 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013912 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013913 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013914 if (PyUnicode_READY(s) == -1) {
13915 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013916 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013917 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013918 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013919 case SSTATE_NOT_INTERNED:
13920 /* XXX Shouldn't happen */
13921 break;
13922 case SSTATE_INTERNED_IMMORTAL:
13923 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013924 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013925 break;
13926 case SSTATE_INTERNED_MORTAL:
13927 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013928 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013929 break;
13930 default:
13931 Py_FatalError("Inconsistent interned string state.");
13932 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013933 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013934 }
13935 fprintf(stderr, "total size of all interned strings: "
13936 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13937 "mortal/immortal\n", mortal_size, immortal_size);
13938 Py_DECREF(keys);
13939 PyDict_Clear(interned);
13940 Py_DECREF(interned);
13941 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013942}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013943
13944
13945/********************* Unicode Iterator **************************/
13946
13947typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013948 PyObject_HEAD
13949 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013950 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013951} unicodeiterobject;
13952
13953static void
13954unicodeiter_dealloc(unicodeiterobject *it)
13955{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013956 _PyObject_GC_UNTRACK(it);
13957 Py_XDECREF(it->it_seq);
13958 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013959}
13960
13961static int
13962unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13963{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013964 Py_VISIT(it->it_seq);
13965 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013966}
13967
13968static PyObject *
13969unicodeiter_next(unicodeiterobject *it)
13970{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013971 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013972
Benjamin Peterson14339b62009-01-31 16:36:08 +000013973 assert(it != NULL);
13974 seq = it->it_seq;
13975 if (seq == NULL)
13976 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013977 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013979 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13980 int kind = PyUnicode_KIND(seq);
13981 void *data = PyUnicode_DATA(seq);
13982 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13983 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013984 if (item != NULL)
13985 ++it->it_index;
13986 return item;
13987 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013988
Benjamin Peterson14339b62009-01-31 16:36:08 +000013989 Py_DECREF(seq);
13990 it->it_seq = NULL;
13991 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013992}
13993
13994static PyObject *
13995unicodeiter_len(unicodeiterobject *it)
13996{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013997 Py_ssize_t len = 0;
13998 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013999 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014000 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014001}
14002
14003PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14004
14005static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014006 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014007 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014008 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014009};
14010
14011PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014012 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14013 "str_iterator", /* tp_name */
14014 sizeof(unicodeiterobject), /* tp_basicsize */
14015 0, /* tp_itemsize */
14016 /* methods */
14017 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14018 0, /* tp_print */
14019 0, /* tp_getattr */
14020 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014021 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014022 0, /* tp_repr */
14023 0, /* tp_as_number */
14024 0, /* tp_as_sequence */
14025 0, /* tp_as_mapping */
14026 0, /* tp_hash */
14027 0, /* tp_call */
14028 0, /* tp_str */
14029 PyObject_GenericGetAttr, /* tp_getattro */
14030 0, /* tp_setattro */
14031 0, /* tp_as_buffer */
14032 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14033 0, /* tp_doc */
14034 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14035 0, /* tp_clear */
14036 0, /* tp_richcompare */
14037 0, /* tp_weaklistoffset */
14038 PyObject_SelfIter, /* tp_iter */
14039 (iternextfunc)unicodeiter_next, /* tp_iternext */
14040 unicodeiter_methods, /* tp_methods */
14041 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014042};
14043
14044static PyObject *
14045unicode_iter(PyObject *seq)
14046{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014047 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014048
Benjamin Peterson14339b62009-01-31 16:36:08 +000014049 if (!PyUnicode_Check(seq)) {
14050 PyErr_BadInternalCall();
14051 return NULL;
14052 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014053 if (PyUnicode_READY(seq) == -1)
14054 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014055 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14056 if (it == NULL)
14057 return NULL;
14058 it->it_index = 0;
14059 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014060 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014061 _PyObject_GC_TRACK(it);
14062 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014063}
14064
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014065
14066size_t
14067Py_UNICODE_strlen(const Py_UNICODE *u)
14068{
14069 int res = 0;
14070 while(*u++)
14071 res++;
14072 return res;
14073}
14074
14075Py_UNICODE*
14076Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14077{
14078 Py_UNICODE *u = s1;
14079 while ((*u++ = *s2++));
14080 return s1;
14081}
14082
14083Py_UNICODE*
14084Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14085{
14086 Py_UNICODE *u = s1;
14087 while ((*u++ = *s2++))
14088 if (n-- == 0)
14089 break;
14090 return s1;
14091}
14092
14093Py_UNICODE*
14094Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14095{
14096 Py_UNICODE *u1 = s1;
14097 u1 += Py_UNICODE_strlen(u1);
14098 Py_UNICODE_strcpy(u1, s2);
14099 return s1;
14100}
14101
14102int
14103Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14104{
14105 while (*s1 && *s2 && *s1 == *s2)
14106 s1++, s2++;
14107 if (*s1 && *s2)
14108 return (*s1 < *s2) ? -1 : +1;
14109 if (*s1)
14110 return 1;
14111 if (*s2)
14112 return -1;
14113 return 0;
14114}
14115
14116int
14117Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14118{
14119 register Py_UNICODE u1, u2;
14120 for (; n != 0; n--) {
14121 u1 = *s1;
14122 u2 = *s2;
14123 if (u1 != u2)
14124 return (u1 < u2) ? -1 : +1;
14125 if (u1 == '\0')
14126 return 0;
14127 s1++;
14128 s2++;
14129 }
14130 return 0;
14131}
14132
14133Py_UNICODE*
14134Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14135{
14136 const Py_UNICODE *p;
14137 for (p = s; *p; p++)
14138 if (*p == c)
14139 return (Py_UNICODE*)p;
14140 return NULL;
14141}
14142
14143Py_UNICODE*
14144Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14145{
14146 const Py_UNICODE *p;
14147 p = s + Py_UNICODE_strlen(s);
14148 while (p != s) {
14149 p--;
14150 if (*p == c)
14151 return (Py_UNICODE*)p;
14152 }
14153 return NULL;
14154}
Victor Stinner331ea922010-08-10 16:37:20 +000014155
Victor Stinner71133ff2010-09-01 23:43:53 +000014156Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014157PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014158{
Victor Stinner577db2c2011-10-11 22:12:48 +020014159 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014160 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014161
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014162 if (!PyUnicode_Check(unicode)) {
14163 PyErr_BadArgument();
14164 return NULL;
14165 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014166 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014167 if (u == NULL)
14168 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014169 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014170 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014171 PyErr_NoMemory();
14172 return NULL;
14173 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014174 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014175 size *= sizeof(Py_UNICODE);
14176 copy = PyMem_Malloc(size);
14177 if (copy == NULL) {
14178 PyErr_NoMemory();
14179 return NULL;
14180 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014181 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014182 return copy;
14183}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014184
Georg Brandl66c221e2010-10-14 07:04:07 +000014185/* A _string module, to export formatter_parser and formatter_field_name_split
14186 to the string.Formatter class implemented in Python. */
14187
14188static PyMethodDef _string_methods[] = {
14189 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14190 METH_O, PyDoc_STR("split the argument as a field name")},
14191 {"formatter_parser", (PyCFunction) formatter_parser,
14192 METH_O, PyDoc_STR("parse the argument as a format string")},
14193 {NULL, NULL}
14194};
14195
14196static struct PyModuleDef _string_module = {
14197 PyModuleDef_HEAD_INIT,
14198 "_string",
14199 PyDoc_STR("string helper module"),
14200 0,
14201 _string_methods,
14202 NULL,
14203 NULL,
14204 NULL,
14205 NULL
14206};
14207
14208PyMODINIT_FUNC
14209PyInit__string(void)
14210{
14211 return PyModule_Create(&_string_module);
14212}
14213
14214
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014215#ifdef __cplusplus
14216}
14217#endif