blob: 2846bd228de8cfe755df68d020fb42a2ab0943f2 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Byte-order selection.  Big endian is chosen only when the build defines
   WORDS_BIGENDIAN; in every other case little endian is assumed. */

#if !defined(WORDS_BIGENDIAN)
#  define BYTEORDER_IS_LITTLE_ENDIAN
#else
#  define BYTEORDER_IS_BIG_ENDIAN
#endif
56
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000057/* --- Globals ------------------------------------------------------------
58
59 The globals are initialized by the _PyUnicode_Init() API and should
60 not be used before calling that API.
61
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Largest valid code point as of Unicode 6.0: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10FFFF
71
/* Type check used inside the assertions of this file.  Release builds only
   verify the type; debug builds run the full structural consistency check
   (without scanning the character data). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* --- Raw field accessors -------------------------------------------------
   These expand to the struct member itself, so they can be used both to
   read and to assign.  No checking is performed. */
#define _PyUnicode_UTF8(op)         (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op)         (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject*)(op))->data.any)

/* --- Checked accessors ---------------------------------------------------
   Each one asserts that op passes _PyUnicode_CHECK before touching it. */

/* UTF-8 buffer of a ready string: for a compact ASCII object this is the
   memory directly following the PyASCIIObject header, otherwise the utf8
   member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Length of the UTF-8 buffer of a ready string: the character length for a
   compact ASCII object, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* The state.kind field. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)

/* The length field. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* File-local replacement for PyUnicode_READY: yields 0 when op is already
   ready, otherwise the result of _PyUnicode_Ready(op).  The object is
   checked with _PyUnicode_CHECK first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the utf8 pointer of op is the very same memory as its character
   data.  Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the very same memory as its character
   data.  The macro refers only to its own argument, so it works whatever
   the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* True if the Unicode object owns a separately allocated UTF-8 buffer:
   it is not compact ASCII, its utf8 pointer is set, and that pointer is
   not the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
135
/* True if the Unicode object owns a separately allocated wstr buffer:
   wstr is set and either the object is not ready yet, or wstr is not the
   character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
143
/* Generic helper macro to convert characters of different types.

   from_type, to_type: valid type names of the source and target characters.
   begin, end:         pointers of type "from_type *" delimiting the source
                       characters (end is one past the last character).
   to:                 pointer to the buffer receiving the converted
                       characters; it is cast to "to_type *".

   Every source character is converted with a plain cast.  The main loop is
   unrolled four times and a second loop copies the 0-3 remaining
   characters.  All arguments are evaluated exactly once, and `to` is
   parenthesized so that an arbitrary expression may be passed. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200167
/* The Unicode string has been modified: invalidate the cached hash by
   storing -1 in the hash field. */
#define _PyUnicode_DIRTY(op)            \
    do {                                \
        _PyUnicode_HASH(op) = -1;       \
    } while (0)
170
Walter Dörwald16807132007-05-25 13:52:07 +0000171/* This dictionary holds all interned unicode strings. Note that references
172 to strings in this dictionary are *not* counted in the string's ob_refcnt.
173 When the interned string reaches a refcnt of 0 the string deallocation
174 function will delete the reference from this dictionary.
175
176 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000177 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000178*/
179static PyObject *interned;
180
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200182static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000183
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200184/* List of static strings. */
185static _Py_Identifier *static_strings;
186
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187/* Single character Unicode strings in the Latin-1 range are being
188 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200189static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190
/* Fast detection of the most frequent whitespace characters.

   128-entry table indexed by ASCII code; an entry is 1 for:
     0x09 CHARACTER TABULATION    0x1C FILE SEPARATOR
     0x0A LINE FEED               0x1D GROUP SEPARATOR
     0x0B LINE TABULATION         0x1E RECORD SEPARATOR
     0x0C FORM FEED               0x1F UNIT SEPARATOR
     0x0D CARRIAGE RETURN         0x20 SPACE
   and 0 for every other code. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
221
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200223static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200224static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200225static void copy_characters(
226 PyObject *to, Py_ssize_t to_start,
227 PyObject *from, Py_ssize_t from_start,
228 Py_ssize_t how_many);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200231unicode_fromascii(const unsigned char *s, Py_ssize_t size);
232static PyObject *
233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
234static PyObject *
235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
236static PyObject *
237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
238
239static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100242 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100248 PyObject *unicode,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Fast detection of ASCII line breaks (companion of _Py_ascii_whitespace).

   128-entry table indexed by ASCII code; an entry is 1 for:
     0x0A LINE FEED               0x1C FILE SEPARATOR
     0x0B LINE TABULATION         0x1D GROUP SEPARATOR
     0x0C FORM FEED               0x1E RECORD SEPARATOR
     0x0D CARRIAGE RETURN
   and 0 for every other code.  The table is only ever read, so it is
   declared const like _Py_ascii_whitespace. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-only invariant checker for Unicode objects.

   op:            object to verify; must satisfy PyUnicode_Check().
   check_content: when non-zero (and the string is not in the legacy wchar
                  representation), the characters are scanned as well to
                  verify that the narrowest suitable kind is in use.

   Every violated invariant trips an assert(); when all checks pass the
   function returns 1, so it can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii != 1 || ascii->state.compact != 1) {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* kind whose character width equals sizeof(wchar_t) */
        const unsigned int wstr_kind =
#if SIZEOF_WCHAR_T == 2
            PyUnicode_2BYTE_KIND;
#else
            PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that has not been made ready yet */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* legacy string that is ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the character data exactly when the kind has
               the width of wchar_t */
            if (kind == wstr_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    else {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 highest = 0;
        void *chars = PyUnicode_DATA(ascii);

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 current = PyUnicode_READ(kind, chars, pos);
            if (current > highest)
                highest = current;
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            else {
                assert(highest < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
        }
        else {
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
        }
    }
    return 1;
}
#endif
405
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100406static PyObject*
407unicode_result_wchar(PyObject *unicode)
408{
409#ifndef Py_DEBUG
410 Py_ssize_t len;
411
412 assert(Py_REFCNT(unicode) == 1);
413
414 len = _PyUnicode_WSTR_LENGTH(unicode);
415 if (len == 0) {
416 Py_INCREF(unicode_empty);
417 Py_DECREF(unicode);
418 return unicode_empty;
419 }
420
421 if (len == 1) {
422 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
423 if (ch < 256) {
424 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
425 Py_DECREF(unicode);
426 return latin1_char;
427 }
428 }
429
430 if (_PyUnicode_Ready(unicode) < 0) {
431 Py_XDECREF(unicode);
432 return NULL;
433 }
434#else
435 /* don't make the result ready in debug mode to ensure that the caller
436 makes the string ready before using it */
437 assert(_PyUnicode_CheckConsistency(unicode, 1));
438#endif
439 return unicode;
440}
441
442static PyObject*
443unicode_result_ready(PyObject *unicode)
444{
445 Py_ssize_t length;
446
447 length = PyUnicode_GET_LENGTH(unicode);
448 if (length == 0) {
449 if (unicode != unicode_empty) {
450 Py_INCREF(unicode_empty);
451 Py_DECREF(unicode);
452 }
453 return unicode_empty;
454 }
455
456 if (length == 1) {
457 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
458 if (ch < 256) {
459 PyObject *latin1_char = unicode_latin1[ch];
460 if (latin1_char != NULL) {
461 if (unicode != latin1_char) {
462 Py_INCREF(latin1_char);
463 Py_DECREF(unicode);
464 }
465 return latin1_char;
466 }
467 else {
468 assert(_PyUnicode_CheckConsistency(unicode, 1));
469 Py_INCREF(unicode);
470 unicode_latin1[ch] = unicode;
471 return unicode;
472 }
473 }
474 }
475
476 assert(_PyUnicode_CheckConsistency(unicode, 1));
477 return unicode;
478}
479
480static PyObject*
481unicode_result(PyObject *unicode)
482{
483 assert(_PyUnicode_CHECK(unicode));
484 if (PyUnicode_IS_READY(unicode))
485 return unicode_result_ready(unicode);
486 else
487 return unicode_result_wchar(unicode);
488}
489
/* Windows version information; only present in builds with HAVE_MBCS. */
#if defined(HAVE_MBCS)
static OSVERSIONINFOEX winver;
#endif
493
Thomas Wouters477c8d52006-05-27 19:21:47 +0000494/* --- Bloom Filters ----------------------------------------------------- */
495
496/* stuff to implement simple "bloom filters" for Unicode characters.
497 to keep things simple, we use a single bitmask, using the least 5
498 bits from each unicode characters as the bit index. */
499
500/* the linebreak mask is set up by Unicode_Init below */
501
Antoine Pitrouf068f942010-01-13 14:19:12 +0000502#if LONG_BIT >= 128
503#define BLOOM_WIDTH 128
504#elif LONG_BIT >= 64
505#define BLOOM_WIDTH 64
506#elif LONG_BIT >= 32
507#define BLOOM_WIDTH 32
508#else
509#error "LONG_BIT is smaller than 32"
510#endif
511
Thomas Wouters477c8d52006-05-27 19:21:47 +0000512#define BLOOM_MASK unsigned long
513
514static BLOOM_MASK bloom_linebreak;
515
Antoine Pitrouf068f942010-01-13 14:19:12 +0000516#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
517#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000518
/* Line break test.  Characters below 128 are answered directly from the
   ascii_linebreak table; for all others the bloom filter is consulted
   first and Py_UNICODE_ISLINEBREAK() only runs when the filter matches. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000522
Alexander Belopolsky40018472011-02-26 01:02:56 +0000523Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200524make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000525{
526 /* calculate simple bloom-style bitmask for a given unicode string */
527
Antoine Pitrouf068f942010-01-13 14:19:12 +0000528 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000529 Py_ssize_t i;
530
531 mask = 0;
532 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200533 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000534
535 return mask;
536}
537
/* True if chr occurs in str.  The bloom mask (which must have been built
   from str) is tested first, so PyUnicode_FindChar() over the whole string
   only runs when the filter matches. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000541
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200542/* Compilation of templated routines */
543
544#include "stringlib/asciilib.h"
545#include "stringlib/fastsearch.h"
546#include "stringlib/partition.h"
547#include "stringlib/split.h"
548#include "stringlib/count.h"
549#include "stringlib/find.h"
550#include "stringlib/find_max_char.h"
551#include "stringlib/localeutil.h"
552#include "stringlib/undef.h"
553
554#include "stringlib/ucs1lib.h"
555#include "stringlib/fastsearch.h"
556#include "stringlib/partition.h"
557#include "stringlib/split.h"
558#include "stringlib/count.h"
559#include "stringlib/find.h"
560#include "stringlib/find_max_char.h"
561#include "stringlib/localeutil.h"
562#include "stringlib/undef.h"
563
564#include "stringlib/ucs2lib.h"
565#include "stringlib/fastsearch.h"
566#include "stringlib/partition.h"
567#include "stringlib/split.h"
568#include "stringlib/count.h"
569#include "stringlib/find.h"
570#include "stringlib/find_max_char.h"
571#include "stringlib/localeutil.h"
572#include "stringlib/undef.h"
573
574#include "stringlib/ucs4lib.h"
575#include "stringlib/fastsearch.h"
576#include "stringlib/partition.h"
577#include "stringlib/split.h"
578#include "stringlib/count.h"
579#include "stringlib/find.h"
580#include "stringlib/find_max_char.h"
581#include "stringlib/localeutil.h"
582#include "stringlib/undef.h"
583
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200584#include "stringlib/unicodedefs.h"
585#include "stringlib/fastsearch.h"
586#include "stringlib/count.h"
587#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100588#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200589
Guido van Rossumd57fd912000-03-10 22:53:23 +0000590/* --- Unicode Object ----------------------------------------------------- */
591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200592static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200593fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200594
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200595Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
596 Py_ssize_t size, Py_UCS4 ch,
597 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200598{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200599 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
600
601 switch (kind) {
602 case PyUnicode_1BYTE_KIND:
603 {
604 Py_UCS1 ch1 = (Py_UCS1) ch;
605 if (ch1 == ch)
606 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
607 else
608 return -1;
609 }
610 case PyUnicode_2BYTE_KIND:
611 {
612 Py_UCS2 ch2 = (Py_UCS2) ch;
613 if (ch2 == ch)
614 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
615 else
616 return -1;
617 }
618 case PyUnicode_4BYTE_KIND:
619 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
620 default:
621 assert(0);
622 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200624}
625
Victor Stinnerfe226c02011-10-03 03:52:20 +0200626static PyObject*
627resize_compact(PyObject *unicode, Py_ssize_t length)
628{
629 Py_ssize_t char_size;
630 Py_ssize_t struct_size;
631 Py_ssize_t new_size;
632 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100633 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200634
635 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200636 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200637 if (PyUnicode_IS_COMPACT_ASCII(unicode))
638 struct_size = sizeof(PyASCIIObject);
639 else
640 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200641 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200642
Victor Stinnerfe226c02011-10-03 03:52:20 +0200643 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
Victor Stinner84def372011-12-11 20:04:56 +0100644 Py_DECREF(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200645 PyErr_NoMemory();
646 return NULL;
647 }
648 new_size = (struct_size + (length + 1) * char_size);
649
Victor Stinner84def372011-12-11 20:04:56 +0100650 _Py_DEC_REFTOTAL;
651 _Py_ForgetReference(unicode);
652
653 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
654 if (new_unicode == NULL) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200655 PyObject_Del(unicode);
656 PyErr_NoMemory();
657 return NULL;
658 }
Victor Stinner84def372011-12-11 20:04:56 +0100659 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200660 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100661
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200663 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200664 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200665 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
666 _PyUnicode_WSTR_LENGTH(unicode) = length;
667 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200668 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
669 length, 0);
670 return unicode;
671}
672
Alexander Belopolsky40018472011-02-26 01:02:56 +0000673static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200674resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000675{
Victor Stinner95663112011-10-04 01:03:50 +0200676 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000679
Victor Stinner95663112011-10-04 01:03:50 +0200680 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681
682 if (PyUnicode_IS_READY(unicode)) {
683 Py_ssize_t char_size;
684 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200685 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200686 void *data;
687
688 data = _PyUnicode_DATA_ANY(unicode);
689 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200690 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200691 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
692 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200693 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
694 {
695 PyObject_DEL(_PyUnicode_UTF8(unicode));
696 _PyUnicode_UTF8(unicode) = NULL;
697 _PyUnicode_UTF8_LENGTH(unicode) = 0;
698 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200699
700 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
701 PyErr_NoMemory();
702 return -1;
703 }
704 new_size = (length + 1) * char_size;
705
706 data = (PyObject *)PyObject_REALLOC(data, new_size);
707 if (data == NULL) {
708 PyErr_NoMemory();
709 return -1;
710 }
711 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200712 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200713 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200714 _PyUnicode_WSTR_LENGTH(unicode) = length;
715 }
716 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200717 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200718 _PyUnicode_UTF8_LENGTH(unicode) = length;
719 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200720 _PyUnicode_LENGTH(unicode) = length;
721 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200722 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200723 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200724 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200726 }
Victor Stinner95663112011-10-04 01:03:50 +0200727 assert(_PyUnicode_WSTR(unicode) != NULL);
728
729 /* check for integer overflow */
730 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
731 PyErr_NoMemory();
732 return -1;
733 }
734 wstr = _PyUnicode_WSTR(unicode);
735 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
736 if (!wstr) {
737 PyErr_NoMemory();
738 return -1;
739 }
740 _PyUnicode_WSTR(unicode) = wstr;
741 _PyUnicode_WSTR(unicode)[length] = 0;
742 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200743 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000744 return 0;
745}
746
Victor Stinnerfe226c02011-10-03 03:52:20 +0200747static PyObject*
748resize_copy(PyObject *unicode, Py_ssize_t length)
749{
750 Py_ssize_t copy_length;
751 if (PyUnicode_IS_COMPACT(unicode)) {
752 PyObject *copy;
753 assert(PyUnicode_IS_READY(unicode));
754
755 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
756 if (copy == NULL)
757 return NULL;
758
759 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200760 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200761 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200762 }
763 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200764 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 assert(_PyUnicode_WSTR(unicode) != NULL);
766 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200767 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200768 if (w == NULL)
769 return NULL;
770 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
771 copy_length = Py_MIN(copy_length, length);
772 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
773 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200774 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200775 }
776}
777
/* We allocate one extra character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200787#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200788static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200789#endif
790
Alexander Belopolsky40018472011-02-26 01:02:56 +0000791static PyUnicodeObject *
792_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000793{
794 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200795 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000796
Thomas Wouters477c8d52006-05-27 19:21:47 +0000797 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000798 if (length == 0 && unicode_empty != NULL) {
799 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200800 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000801 }
802
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000803 /* Ensure we won't overflow the size. */
804 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
805 return (PyUnicodeObject *)PyErr_NoMemory();
806 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200807 if (length < 0) {
808 PyErr_SetString(PyExc_SystemError,
809 "Negative size passed to _PyUnicode_New");
810 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811 }
812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200813#ifdef Py_DEBUG
814 ++unicode_old_new_calls;
815#endif
816
817 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
818 if (unicode == NULL)
819 return NULL;
820 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
821 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
822 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000823 PyErr_NoMemory();
824 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000825 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826
Jeremy Hyltond8082792003-09-16 19:41:39 +0000827 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000828 * the caller fails before initializing str -- unicode_resize()
829 * reads str[0], and the Keep-Alive optimization can keep memory
830 * allocated for str alive across a call to unicode_dealloc(unicode).
831 * We don't want unicode_resize to read uninitialized memory in
832 * that case.
833 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200834 _PyUnicode_WSTR(unicode)[0] = 0;
835 _PyUnicode_WSTR(unicode)[length] = 0;
836 _PyUnicode_WSTR_LENGTH(unicode) = length;
837 _PyUnicode_HASH(unicode) = -1;
838 _PyUnicode_STATE(unicode).interned = 0;
839 _PyUnicode_STATE(unicode).kind = 0;
840 _PyUnicode_STATE(unicode).compact = 0;
841 _PyUnicode_STATE(unicode).ready = 0;
842 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200843 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200844 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200845 _PyUnicode_UTF8(unicode) = NULL;
846 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100847 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000848 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000849
Benjamin Peterson29060642009-01-31 22:14:21 +0000850 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000851 /* XXX UNREF/NEWREF interface should be more symmetrical */
852 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000853 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000854 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000855 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000856}
857
Victor Stinnerf42dc442011-10-02 23:33:16 +0200858static const char*
859unicode_kind_name(PyObject *unicode)
860{
Victor Stinner42dfd712011-10-03 14:41:45 +0200861 /* don't check consistency: unicode_kind_name() is called from
862 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200863 if (!PyUnicode_IS_COMPACT(unicode))
864 {
865 if (!PyUnicode_IS_READY(unicode))
866 return "wstr";
867 switch(PyUnicode_KIND(unicode))
868 {
869 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200870 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200871 return "legacy ascii";
872 else
873 return "legacy latin1";
874 case PyUnicode_2BYTE_KIND:
875 return "legacy UCS2";
876 case PyUnicode_4BYTE_KIND:
877 return "legacy UCS4";
878 default:
879 return "<legacy invalid kind>";
880 }
881 }
882 assert(PyUnicode_IS_READY(unicode));
883 switch(PyUnicode_KIND(unicode))
884 {
885 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200886 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200887 return "ascii";
888 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200889 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200890 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200891 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 default:
895 return "<invalid compact kind>";
896 }
897}
898
#ifdef Py_DEBUG
/* Number of calls to PyUnicode_New() that went past the empty-string
   shortcut (debug builds only). */
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the addresses relevant to locating the character data of
   `unicode` on stdout, then return PyUnicode_DATA(unicode). */
void *
_PyUnicode_data(void *unicode)
{
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line summary of `op` on stdout: kind name, length, the wstr
   and utf8 pointers (flagged "shared" when they alias the data) and the
   data pointer. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* Compact strings store their characters right after the header. */
    if (!ascii->state.compact)
        data = unicode->data.any;
    else if (ascii->state.ascii)
        data = (ascii + 1);
    else
        data = (compact + 1);

    printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);

    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", ascii->wstr);

    /* The wstr_length/utf8 fields are skipped for compact ASCII strings. */
    if (ascii->state.ascii != 1 || ascii->state.compact != 1) {
        printf(" (%zu), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
952
953PyObject *
954PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
955{
956 PyObject *obj;
957 PyCompactUnicodeObject *unicode;
958 void *data;
959 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200960 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200961 Py_ssize_t char_size;
962 Py_ssize_t struct_size;
963
964 /* Optimization for empty strings */
965 if (size == 0 && unicode_empty != NULL) {
966 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200967 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 }
969
970#ifdef Py_DEBUG
971 ++unicode_new_new_calls;
972#endif
973
Victor Stinner9e9d6892011-10-04 01:02:02 +0200974 is_ascii = 0;
975 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976 struct_size = sizeof(PyCompactUnicodeObject);
977 if (maxchar < 128) {
978 kind_state = PyUnicode_1BYTE_KIND;
979 char_size = 1;
980 is_ascii = 1;
981 struct_size = sizeof(PyASCIIObject);
982 }
983 else if (maxchar < 256) {
984 kind_state = PyUnicode_1BYTE_KIND;
985 char_size = 1;
986 }
987 else if (maxchar < 65536) {
988 kind_state = PyUnicode_2BYTE_KIND;
989 char_size = 2;
990 if (sizeof(wchar_t) == 2)
991 is_sharing = 1;
992 }
993 else {
994 kind_state = PyUnicode_4BYTE_KIND;
995 char_size = 4;
996 if (sizeof(wchar_t) == 4)
997 is_sharing = 1;
998 }
999
1000 /* Ensure we won't overflow the size. */
1001 if (size < 0) {
1002 PyErr_SetString(PyExc_SystemError,
1003 "Negative size passed to PyUnicode_New");
1004 return NULL;
1005 }
1006 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1007 return PyErr_NoMemory();
1008
1009 /* Duplicated allocation code from _PyObject_New() instead of a call to
1010 * PyObject_New() so we are able to allocate space for the object and
1011 * it's data buffer.
1012 */
1013 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1014 if (obj == NULL)
1015 return PyErr_NoMemory();
1016 obj = PyObject_INIT(obj, &PyUnicode_Type);
1017 if (obj == NULL)
1018 return NULL;
1019
1020 unicode = (PyCompactUnicodeObject *)obj;
1021 if (is_ascii)
1022 data = ((PyASCIIObject*)obj) + 1;
1023 else
1024 data = unicode + 1;
1025 _PyUnicode_LENGTH(unicode) = size;
1026 _PyUnicode_HASH(unicode) = -1;
1027 _PyUnicode_STATE(unicode).interned = 0;
1028 _PyUnicode_STATE(unicode).kind = kind_state;
1029 _PyUnicode_STATE(unicode).compact = 1;
1030 _PyUnicode_STATE(unicode).ready = 1;
1031 _PyUnicode_STATE(unicode).ascii = is_ascii;
1032 if (is_ascii) {
1033 ((char*)data)[size] = 0;
1034 _PyUnicode_WSTR(unicode) = NULL;
1035 }
1036 else if (kind_state == PyUnicode_1BYTE_KIND) {
1037 ((char*)data)[size] = 0;
1038 _PyUnicode_WSTR(unicode) = NULL;
1039 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001040 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001041 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001042 }
1043 else {
1044 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001045 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001046 if (kind_state == PyUnicode_2BYTE_KIND)
1047 ((Py_UCS2*)data)[size] = 0;
1048 else /* kind_state == PyUnicode_4BYTE_KIND */
1049 ((Py_UCS4*)data)[size] = 0;
1050 if (is_sharing) {
1051 _PyUnicode_WSTR_LENGTH(unicode) = size;
1052 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1053 }
1054 else {
1055 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1056 _PyUnicode_WSTR(unicode) = NULL;
1057 }
1058 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001059 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 return obj;
1061}
1062
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store it in the
   4-byte data buffer of `unicode`.  A high surrogate immediately followed by
   a low surrogate is joined into one code point; every other unit, including
   a lone surrogate, is copied unchanged.  The other conversions are
   implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1102
Victor Stinnercd9950f2011-10-02 00:34:53 +02001103static int
1104_PyUnicode_Dirty(PyObject *unicode)
1105{
Victor Stinner910337b2011-10-03 03:20:16 +02001106 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001107 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001108 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001109 "Cannot modify a string having more than 1 reference");
1110 return -1;
1111 }
1112 _PyUnicode_DIRTY(unicode);
1113 return 0;
1114}
1115
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001116static int
1117_copy_characters(PyObject *to, Py_ssize_t to_start,
1118 PyObject *from, Py_ssize_t from_start,
1119 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001121 unsigned int from_kind, to_kind;
1122 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001123 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001124
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001125 assert(PyUnicode_Check(from));
1126 assert(PyUnicode_Check(to));
1127 assert(PyUnicode_IS_READY(from));
1128 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1131 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1132 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001133
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001134 if (how_many == 0)
1135 return 0;
1136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001137 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001138 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001139 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001140 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001142#ifdef Py_DEBUG
1143 if (!check_maxchar
1144 && (from_kind > to_kind
1145 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001146 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1148 Py_UCS4 ch;
1149 Py_ssize_t i;
1150 for (i=0; i < how_many; i++) {
1151 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1152 assert(ch <= to_maxchar);
1153 }
1154 }
1155#endif
1156 fast = (from_kind == to_kind);
1157 if (check_maxchar
1158 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1159 {
1160 /* deny latin1 => ascii */
1161 fast = 0;
1162 }
1163
1164 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001165 Py_MEMCPY((char*)to_data + to_kind * to_start,
1166 (char*)from_data + from_kind * from_start,
1167 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001168 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001169 else if (from_kind == PyUnicode_1BYTE_KIND
1170 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001171 {
1172 _PyUnicode_CONVERT_BYTES(
1173 Py_UCS1, Py_UCS2,
1174 PyUnicode_1BYTE_DATA(from) + from_start,
1175 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1176 PyUnicode_2BYTE_DATA(to) + to_start
1177 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001178 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001179 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001180 && to_kind == PyUnicode_4BYTE_KIND)
1181 {
1182 _PyUnicode_CONVERT_BYTES(
1183 Py_UCS1, Py_UCS4,
1184 PyUnicode_1BYTE_DATA(from) + from_start,
1185 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1186 PyUnicode_4BYTE_DATA(to) + to_start
1187 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001188 }
1189 else if (from_kind == PyUnicode_2BYTE_KIND
1190 && to_kind == PyUnicode_4BYTE_KIND)
1191 {
1192 _PyUnicode_CONVERT_BYTES(
1193 Py_UCS2, Py_UCS4,
1194 PyUnicode_2BYTE_DATA(from) + from_start,
1195 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1196 PyUnicode_4BYTE_DATA(to) + to_start
1197 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001198 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001199 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001200 /* check if max_char(from substring) <= max_char(to) */
1201 if (from_kind > to_kind
1202 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001203 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001204 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001205 /* slow path to check for character overflow */
1206 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001207 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001208 Py_ssize_t i;
1209
Victor Stinner56c161a2011-10-06 02:47:11 +02001210#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001211 for (i=0; i < how_many; i++) {
1212 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001213 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001214 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1215 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001216#else
1217 if (!check_maxchar) {
1218 for (i=0; i < how_many; i++) {
1219 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1220 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1221 }
1222 }
1223 else {
1224 for (i=0; i < how_many; i++) {
1225 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1226 if (ch > to_maxchar)
1227 return 1;
1228 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1229 }
1230 }
1231#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001232 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001233 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001234 assert(0 && "inconsistent state");
1235 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001236 }
1237 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001238 return 0;
1239}
1240
1241static void
1242copy_characters(PyObject *to, Py_ssize_t to_start,
1243 PyObject *from, Py_ssize_t from_start,
1244 Py_ssize_t how_many)
1245{
1246 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1247}
1248
1249Py_ssize_t
1250PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1251 PyObject *from, Py_ssize_t from_start,
1252 Py_ssize_t how_many)
1253{
1254 int err;
1255
1256 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1257 PyErr_BadInternalCall();
1258 return -1;
1259 }
1260
1261 if (PyUnicode_READY(from))
1262 return -1;
1263 if (PyUnicode_READY(to))
1264 return -1;
1265
1266 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1267 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1268 PyErr_Format(PyExc_SystemError,
1269 "Cannot write %zi characters at %zi "
1270 "in a string of %zi characters",
1271 how_many, to_start, PyUnicode_GET_LENGTH(to));
1272 return -1;
1273 }
1274
1275 if (how_many == 0)
1276 return 0;
1277
1278 if (_PyUnicode_Dirty(to))
1279 return -1;
1280
1281 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1282 if (err) {
1283 PyErr_Format(PyExc_SystemError,
1284 "Cannot copy %s characters "
1285 "into a string of %s characters",
1286 unicode_kind_name(from),
1287 unicode_kind_name(to));
1288 return -1;
1289 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001290 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001291}
1292
Victor Stinner17222162011-09-28 22:15:37 +02001293/* Find the maximum code point and count the number of surrogate pairs so a
1294 correct string length can be computed before converting a string to UCS4.
1295 This function counts single surrogates as a character and not as a pair.
1296
1297 Return 0 on success, or -1 on error. */
1298static int
1299find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1300 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001301{
1302 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001303 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304
Victor Stinnerc53be962011-10-02 21:33:54 +02001305 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306 *num_surrogates = 0;
1307 *maxchar = 0;
1308
1309 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001311 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1312 && (iter+1) < end
1313 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001315 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 iter += 2;
1318 }
1319 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001321 {
1322 ch = *iter;
1323 iter++;
1324 }
1325 if (ch > *maxchar) {
1326 *maxchar = ch;
1327 if (*maxchar > MAX_UNICODE) {
1328 PyErr_Format(PyExc_ValueError,
1329 "character U+%x is not in range [U+0000; U+10ffff]",
1330 ch);
1331 return -1;
1332 }
1333 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334 }
1335 return 0;
1336}
1337
1338#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001339static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001340#endif
1341
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001342int
1343_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344{
1345 wchar_t *end;
1346 Py_UCS4 maxchar = 0;
1347 Py_ssize_t num_surrogates;
1348#if SIZEOF_WCHAR_T == 2
1349 Py_ssize_t length_wo_surrogates;
1350#endif
1351
Georg Brandl7597add2011-10-05 16:36:47 +02001352 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001353 strings were created using _PyObject_New() and where no canonical
1354 representation (the str field) has been set yet aka strings
1355 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001356 assert(_PyUnicode_CHECK(unicode));
1357 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001358 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001359 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001360 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001361 /* Actually, it should neither be interned nor be anything else: */
1362 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363
1364#ifdef Py_DEBUG
1365 ++unicode_ready_calls;
1366#endif
1367
1368 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001369 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001370 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372
1373 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001374 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1375 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376 PyErr_NoMemory();
1377 return -1;
1378 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001379 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001380 _PyUnicode_WSTR(unicode), end,
1381 PyUnicode_1BYTE_DATA(unicode));
1382 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1383 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1384 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1385 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001386 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001387 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001388 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389 }
1390 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001391 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001392 _PyUnicode_UTF8(unicode) = NULL;
1393 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 }
1395 PyObject_FREE(_PyUnicode_WSTR(unicode));
1396 _PyUnicode_WSTR(unicode) = NULL;
1397 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1398 }
1399 /* In this case we might have to convert down from 4-byte native
1400 wchar_t to 2-byte unicode. */
1401 else if (maxchar < 65536) {
1402 assert(num_surrogates == 0 &&
1403 "FindMaxCharAndNumSurrogatePairs() messed up");
1404
Victor Stinner506f5922011-09-28 22:34:18 +02001405#if SIZEOF_WCHAR_T == 2
1406 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001407 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001408 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1409 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1410 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001411 _PyUnicode_UTF8(unicode) = NULL;
1412 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001413#else
1414 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001415 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001416 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001417 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001418 PyErr_NoMemory();
1419 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420 }
Victor Stinner506f5922011-09-28 22:34:18 +02001421 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1422 _PyUnicode_WSTR(unicode), end,
1423 PyUnicode_2BYTE_DATA(unicode));
1424 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1425 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1426 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001427 _PyUnicode_UTF8(unicode) = NULL;
1428 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001429 PyObject_FREE(_PyUnicode_WSTR(unicode));
1430 _PyUnicode_WSTR(unicode) = NULL;
1431 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1432#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 }
1434 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1435 else {
1436#if SIZEOF_WCHAR_T == 2
1437 /* in case the native representation is 2-bytes, we need to allocate a
1438 new normalized 4-byte version. */
1439 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001440 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1441 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001442 PyErr_NoMemory();
1443 return -1;
1444 }
1445 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1446 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001447 _PyUnicode_UTF8(unicode) = NULL;
1448 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001449 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1450 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001451 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452 PyObject_FREE(_PyUnicode_WSTR(unicode));
1453 _PyUnicode_WSTR(unicode) = NULL;
1454 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1455#else
1456 assert(num_surrogates == 0);
1457
Victor Stinnerc3c74152011-10-02 20:39:55 +02001458 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001459 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001460 _PyUnicode_UTF8(unicode) = NULL;
1461 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1463#endif
1464 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1465 }
1466 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001467 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 return 0;
1469}
1470
Alexander Belopolsky40018472011-02-26 01:02:56 +00001471static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001472unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001473{
Walter Dörwald16807132007-05-25 13:52:07 +00001474 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001475 case SSTATE_NOT_INTERNED:
1476 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001477
Benjamin Peterson29060642009-01-31 22:14:21 +00001478 case SSTATE_INTERNED_MORTAL:
1479 /* revive dead object temporarily for DelItem */
1480 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001481 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001482 Py_FatalError(
1483 "deletion of interned string failed");
1484 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001485
Benjamin Peterson29060642009-01-31 22:14:21 +00001486 case SSTATE_INTERNED_IMMORTAL:
1487 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001488
Benjamin Peterson29060642009-01-31 22:14:21 +00001489 default:
1490 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001491 }
1492
Victor Stinner03490912011-10-03 23:45:12 +02001493 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001494 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001495 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001496 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497
1498 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001499 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500 }
1501 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001502 if (_PyUnicode_DATA_ANY(unicode))
1503 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001504 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505 }
1506}
1507
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or
   the object stored in unicode_latin1[] for its single character, and 0
   otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    ascii = (PyASCIIObject *)unicode;
    /* Only a one-character string that is not of WCHAR kind can be a
       cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1524
Alexander Belopolsky40018472011-02-26 01:02:56 +00001525static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001526unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001527{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001528 if (Py_REFCNT(unicode) != 1)
1529 return 0;
1530 if (PyUnicode_CHECK_INTERNED(unicode))
1531 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001532#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001533 /* singleton refcount is greater than 1 */
1534 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001535#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001536 return 1;
1537}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001538
Victor Stinnerfe226c02011-10-03 03:52:20 +02001539static int
1540unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1541{
1542 PyObject *unicode;
1543 Py_ssize_t old_length;
1544
1545 assert(p_unicode != NULL);
1546 unicode = *p_unicode;
1547
1548 assert(unicode != NULL);
1549 assert(PyUnicode_Check(unicode));
1550 assert(0 <= length);
1551
Victor Stinner910337b2011-10-03 03:20:16 +02001552 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001553 old_length = PyUnicode_WSTR_LENGTH(unicode);
1554 else
1555 old_length = PyUnicode_GET_LENGTH(unicode);
1556 if (old_length == length)
1557 return 0;
1558
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001559 if (length == 0) {
1560 Py_DECREF(*p_unicode);
1561 *p_unicode = unicode_empty;
1562 Py_INCREF(*p_unicode);
1563 return 0;
1564 }
1565
Victor Stinnerfe226c02011-10-03 03:52:20 +02001566 if (!unicode_resizable(unicode)) {
1567 PyObject *copy = resize_copy(unicode, length);
1568 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001569 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001570 Py_DECREF(*p_unicode);
1571 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001572 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001573 }
1574
Victor Stinnerfe226c02011-10-03 03:52:20 +02001575 if (PyUnicode_IS_COMPACT(unicode)) {
1576 *p_unicode = resize_compact(unicode, length);
1577 if (*p_unicode == NULL)
1578 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001579 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001580 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001581 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001582 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001583}
1584
Alexander Belopolsky40018472011-02-26 01:02:56 +00001585int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001586PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001587{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001588 PyObject *unicode;
1589 if (p_unicode == NULL) {
1590 PyErr_BadInternalCall();
1591 return -1;
1592 }
1593 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001594 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001595 {
1596 PyErr_BadInternalCall();
1597 return -1;
1598 }
1599 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001600}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001601
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001602static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001603unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001604{
1605 PyObject *result;
1606 assert(PyUnicode_IS_READY(*p_unicode));
1607 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1608 return 0;
1609 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1610 maxchar);
1611 if (result == NULL)
1612 return -1;
1613 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1614 PyUnicode_GET_LENGTH(*p_unicode));
1615 Py_DECREF(*p_unicode);
1616 *p_unicode = result;
1617 return 0;
1618}
1619
1620static int
1621unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1622 Py_UCS4 ch)
1623{
1624 if (unicode_widen(p_unicode, ch) < 0)
1625 return -1;
1626 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1627 PyUnicode_DATA(*p_unicode),
1628 (*pos)++, ch);
1629 return 0;
1630}
1631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632static PyObject*
1633get_latin1_char(unsigned char ch)
1634{
Victor Stinnera464fc12011-10-02 20:39:30 +02001635 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001637 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638 if (!unicode)
1639 return NULL;
1640 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001641 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 unicode_latin1[ch] = unicode;
1643 }
1644 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001645 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646}
1647
Alexander Belopolsky40018472011-02-26 01:02:56 +00001648PyObject *
1649PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001650{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001651 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652 Py_UCS4 maxchar = 0;
1653 Py_ssize_t num_surrogates;
1654
1655 if (u == NULL)
1656 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001658 /* If the Unicode data is known at construction time, we can apply
1659 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661 /* Optimization for empty strings */
1662 if (size == 0 && unicode_empty != NULL) {
1663 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001664 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001665 }
Tim Petersced69f82003-09-16 20:30:58 +00001666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 /* Single character Unicode objects in the Latin-1 range are
1668 shared when using this constructor */
1669 if (size == 1 && *u < 256)
1670 return get_latin1_char((unsigned char)*u);
1671
1672 /* If not empty and not single character, copy the Unicode data
1673 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001674 if (find_maxchar_surrogates(u, u + size,
1675 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676 return NULL;
1677
Victor Stinner8faf8212011-12-08 22:14:11 +01001678 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679 if (!unicode)
1680 return NULL;
1681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682 switch (PyUnicode_KIND(unicode)) {
1683 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001684 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1686 break;
1687 case PyUnicode_2BYTE_KIND:
1688#if Py_UNICODE_SIZE == 2
1689 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1690#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001691 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001692 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1693#endif
1694 break;
1695 case PyUnicode_4BYTE_KIND:
1696#if SIZEOF_WCHAR_T == 2
1697 /* This is the only case which has to process surrogates, thus
1698 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001699 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700#else
1701 assert(num_surrogates == 0);
1702 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1703#endif
1704 break;
1705 default:
1706 assert(0 && "Impossible state");
1707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001709 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710}
1711
Alexander Belopolsky40018472011-02-26 01:02:56 +00001712PyObject *
1713PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001714{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001715 if (size < 0) {
1716 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001717 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001718 return NULL;
1719 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001720
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001721 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001722 some optimizations which share commonly used objects.
1723 Also, this means the input must be UTF-8, so fall back to the
1724 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001725 if (u != NULL) {
1726
Benjamin Peterson29060642009-01-31 22:14:21 +00001727 /* Optimization for empty strings */
1728 if (size == 0 && unicode_empty != NULL) {
1729 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001730 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001731 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001732
1733 /* Single characters are shared when using this constructor.
1734 Restrict to ASCII, since the input must be UTF-8. */
Victor Stinner9faa3842011-10-23 20:06:00 +02001735 if (size == 1 && (unsigned char)*u < 128)
1736 return get_latin1_char((unsigned char)*u);
Martin v. Löwis9c121062007-08-05 20:26:11 +00001737
1738 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001739 }
1740
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001741 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001742}
1743
Alexander Belopolsky40018472011-02-26 01:02:56 +00001744PyObject *
1745PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001746{
1747 size_t size = strlen(u);
1748 if (size > PY_SSIZE_T_MAX) {
1749 PyErr_SetString(PyExc_OverflowError, "input too long");
1750 return NULL;
1751 }
1752
1753 return PyUnicode_FromStringAndSize(u, size);
1754}
1755
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001756PyObject *
1757_PyUnicode_FromId(_Py_Identifier *id)
1758{
1759 if (!id->object) {
1760 id->object = PyUnicode_FromString(id->string);
1761 if (!id->object)
1762 return NULL;
1763 PyUnicode_InternInPlace(&id->object);
1764 assert(!id->next);
1765 id->next = static_strings;
1766 static_strings = id;
1767 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001768 return id->object;
1769}
1770
1771void
1772_PyUnicode_ClearStaticStrings()
1773{
1774 _Py_Identifier *i;
1775 for (i = static_strings; i; i = i->next) {
1776 Py_DECREF(i->object);
1777 i->object = NULL;
1778 i->next = NULL;
1779 }
1780}
1781
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001782/* Internal function, don't check maximum character */
1783
Victor Stinnere57b1c02011-09-28 22:20:48 +02001784static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001785unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001786{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001787 PyObject *res;
1788#ifdef Py_DEBUG
1789 const unsigned char *p;
1790 const unsigned char *end = s + size;
1791 for (p=s; p < end; p++) {
1792 assert(*p < 128);
1793 }
1794#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001795 if (size == 1)
1796 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001797 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001798 if (!res)
1799 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001800 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001801 return res;
1802}
1803
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001804static Py_UCS4
1805kind_maxchar_limit(unsigned int kind)
1806{
1807 switch(kind) {
1808 case PyUnicode_1BYTE_KIND:
1809 return 0x80;
1810 case PyUnicode_2BYTE_KIND:
1811 return 0x100;
1812 case PyUnicode_4BYTE_KIND:
1813 return 0x10000;
1814 default:
1815 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001816 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001817 }
1818}
1819
Victor Stinner702c7342011-10-05 13:50:52 +02001820static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001821_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001822{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001824 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001825
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001826 if (size == 0) {
1827 Py_INCREF(unicode_empty);
1828 return unicode_empty;
1829 }
1830 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001831 if (size == 1)
1832 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001833
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001834 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001835 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001836 if (!res)
1837 return NULL;
1838 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001839 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001841}
1842
Victor Stinnere57b1c02011-09-28 22:20:48 +02001843static PyObject*
1844_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845{
1846 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001847 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001848
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001849 if (size == 0) {
1850 Py_INCREF(unicode_empty);
1851 return unicode_empty;
1852 }
1853 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001854 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001855 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001856
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001857 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001858 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001859 if (!res)
1860 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001861 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001862 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001863 else {
1864 _PyUnicode_CONVERT_BYTES(
1865 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1866 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001867 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001868 return res;
1869}
1870
Victor Stinnere57b1c02011-09-28 22:20:48 +02001871static PyObject*
1872_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873{
1874 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001875 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001876
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001877 if (size == 0) {
1878 Py_INCREF(unicode_empty);
1879 return unicode_empty;
1880 }
1881 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001882 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001883 return get_latin1_char((unsigned char)u[0]);
1884
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001885 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001886 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001887 if (!res)
1888 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001889 if (max_char < 256)
1890 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1891 PyUnicode_1BYTE_DATA(res));
1892 else if (max_char < 0x10000)
1893 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1894 PyUnicode_2BYTE_DATA(res));
1895 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001897 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 return res;
1899}
1900
1901PyObject*
1902PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1903{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001904 if (size < 0) {
1905 PyErr_SetString(PyExc_ValueError, "size must be positive");
1906 return NULL;
1907 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001908 switch(kind) {
1909 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001910 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001911 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001912 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001914 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001915 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001916 PyErr_SetString(PyExc_SystemError, "invalid kind");
1917 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001918 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001919}
1920
Victor Stinner25a4b292011-10-06 12:31:55 +02001921/* Ensure that a string uses the most efficient storage, if it is not the
1922 case: create a new string with of the right kind. Write NULL into *p_unicode
1923 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001924static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001925unicode_adjust_maxchar(PyObject **p_unicode)
1926{
1927 PyObject *unicode, *copy;
1928 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001929 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001930 unsigned int kind;
1931
1932 assert(p_unicode != NULL);
1933 unicode = *p_unicode;
1934 assert(PyUnicode_IS_READY(unicode));
1935 if (PyUnicode_IS_ASCII(unicode))
1936 return;
1937
1938 len = PyUnicode_GET_LENGTH(unicode);
1939 kind = PyUnicode_KIND(unicode);
1940 if (kind == PyUnicode_1BYTE_KIND) {
1941 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001942 max_char = ucs1lib_find_max_char(u, u + len);
1943 if (max_char >= 128)
1944 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001945 }
1946 else if (kind == PyUnicode_2BYTE_KIND) {
1947 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001948 max_char = ucs2lib_find_max_char(u, u + len);
1949 if (max_char >= 256)
1950 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001951 }
1952 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001953 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001954 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001955 max_char = ucs4lib_find_max_char(u, u + len);
1956 if (max_char >= 0x10000)
1957 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001958 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001959 copy = PyUnicode_New(len, max_char);
1960 copy_characters(copy, 0, unicode, 0, len);
1961 Py_DECREF(unicode);
1962 *p_unicode = copy;
1963}
1964
Victor Stinner034f6cf2011-09-30 02:26:44 +02001965PyObject*
1966PyUnicode_Copy(PyObject *unicode)
1967{
Victor Stinner87af4f22011-11-21 23:03:47 +01001968 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001969 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001970
Victor Stinner034f6cf2011-09-30 02:26:44 +02001971 if (!PyUnicode_Check(unicode)) {
1972 PyErr_BadInternalCall();
1973 return NULL;
1974 }
1975 if (PyUnicode_READY(unicode))
1976 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001977
Victor Stinner87af4f22011-11-21 23:03:47 +01001978 length = PyUnicode_GET_LENGTH(unicode);
1979 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001980 if (!copy)
1981 return NULL;
1982 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1983
Victor Stinner87af4f22011-11-21 23:03:47 +01001984 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1985 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001986 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001987 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001988}
1989
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990
Victor Stinnerbc603d12011-10-02 01:00:40 +02001991/* Widen Unicode objects to larger buffers. Don't write terminating null
1992 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001993
1994void*
1995_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1996{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001997 Py_ssize_t len;
1998 void *result;
1999 unsigned int skind;
2000
2001 if (PyUnicode_READY(s))
2002 return NULL;
2003
2004 len = PyUnicode_GET_LENGTH(s);
2005 skind = PyUnicode_KIND(s);
2006 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002007 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 return NULL;
2009 }
2010 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002011 case PyUnicode_2BYTE_KIND:
2012 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2013 if (!result)
2014 return PyErr_NoMemory();
2015 assert(skind == PyUnicode_1BYTE_KIND);
2016 _PyUnicode_CONVERT_BYTES(
2017 Py_UCS1, Py_UCS2,
2018 PyUnicode_1BYTE_DATA(s),
2019 PyUnicode_1BYTE_DATA(s) + len,
2020 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002022 case PyUnicode_4BYTE_KIND:
2023 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2024 if (!result)
2025 return PyErr_NoMemory();
2026 if (skind == PyUnicode_2BYTE_KIND) {
2027 _PyUnicode_CONVERT_BYTES(
2028 Py_UCS2, Py_UCS4,
2029 PyUnicode_2BYTE_DATA(s),
2030 PyUnicode_2BYTE_DATA(s) + len,
2031 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002033 else {
2034 assert(skind == PyUnicode_1BYTE_KIND);
2035 _PyUnicode_CONVERT_BYTES(
2036 Py_UCS1, Py_UCS4,
2037 PyUnicode_1BYTE_DATA(s),
2038 PyUnicode_1BYTE_DATA(s) + len,
2039 result);
2040 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002042 default:
2043 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 }
Victor Stinner01698042011-10-04 00:04:26 +02002045 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046 return NULL;
2047}
2048
2049static Py_UCS4*
2050as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2051 int copy_null)
2052{
2053 int kind;
2054 void *data;
2055 Py_ssize_t len, targetlen;
2056 if (PyUnicode_READY(string) == -1)
2057 return NULL;
2058 kind = PyUnicode_KIND(string);
2059 data = PyUnicode_DATA(string);
2060 len = PyUnicode_GET_LENGTH(string);
2061 targetlen = len;
2062 if (copy_null)
2063 targetlen++;
2064 if (!target) {
2065 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2066 PyErr_NoMemory();
2067 return NULL;
2068 }
2069 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2070 if (!target) {
2071 PyErr_NoMemory();
2072 return NULL;
2073 }
2074 }
2075 else {
2076 if (targetsize < targetlen) {
2077 PyErr_Format(PyExc_SystemError,
2078 "string is longer than the buffer");
2079 if (copy_null && 0 < targetsize)
2080 target[0] = 0;
2081 return NULL;
2082 }
2083 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002084 if (kind == PyUnicode_1BYTE_KIND) {
2085 Py_UCS1 *start = (Py_UCS1 *) data;
2086 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002087 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002088 else if (kind == PyUnicode_2BYTE_KIND) {
2089 Py_UCS2 *start = (Py_UCS2 *) data;
2090 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2091 }
2092 else {
2093 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002094 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002095 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002096 if (copy_null)
2097 target[len] = 0;
2098 return target;
2099}
2100
2101Py_UCS4*
2102PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2103 int copy_null)
2104{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002105 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002106 PyErr_BadInternalCall();
2107 return NULL;
2108 }
2109 return as_ucs4(string, target, targetsize, copy_null);
2110}
2111
2112Py_UCS4*
2113PyUnicode_AsUCS4Copy(PyObject *string)
2114{
2115 return as_ucs4(string, NULL, 0, 1);
2116}
2117
#ifdef HAVE_WCHAR_H

/* Create a string object from the wchar_t buffer `w` of `size`
   elements. A size of -1 means that the length is determined with
   wcslen().

   With w == NULL, a size of 0 yields the result of PyUnicode_New(0, 0)
   and any other size raises a bad-internal-call error and returns
   NULL. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1) {
            size = wcslen(w);
        }
        return PyUnicode_FromUnicode(w, size);
    }

    if (size == 0)
        return PyUnicode_New(0, 0);
    PyErr_BadInternalCall();
    return NULL;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002138
Walter Dörwald346737f2007-05-31 10:44:43 +00002139static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002140makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2141 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002142{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002143 *fmt++ = '%';
2144 if (width) {
2145 if (zeropad)
2146 *fmt++ = '0';
2147 fmt += sprintf(fmt, "%d", width);
2148 }
2149 if (precision)
2150 fmt += sprintf(fmt, ".%d", precision);
2151 if (longflag)
2152 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002153 else if (longlongflag) {
2154 /* longlongflag should only ever be nonzero on machines with
2155 HAVE_LONG_LONG defined */
2156#ifdef HAVE_LONG_LONG
2157 char *f = PY_FORMAT_LONG_LONG;
2158 while (*f)
2159 *fmt++ = *f++;
2160#else
2161 /* we shouldn't ever get here */
2162 assert(0);
2163 *fmt++ = 'l';
2164#endif
2165 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002166 else if (size_tflag) {
2167 char *f = PY_FORMAT_SIZE_T;
2168 while (*f)
2169 *fmt++ = *f++;
2170 }
2171 *fmt++ = c;
2172 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002173}
2174
Victor Stinner96865452011-03-01 23:44:09 +00002175/* helper for PyUnicode_FromFormatV() */
2176
/* Parse the "width.precision" and length-modifier part of one format
   specification.

   f must point at the '%' that starts the specification.  The parsed values
   are stored through each p_* pointer that is not NULL.  The return value
   points at the conversion character (e.g. the 'd' of "%5ld"); for bogus
   specifications it may step back one character so that the caller's
   "f++" loop does not run past the end of the format string.

   A length modifier ('l', 'll' when HAVE_LONG_LONG is defined, or 'z') is
   only recognised when directly followed by 'd', 'u' or 'i'; otherwise the
   returned pointer is left on the modifier character itself. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int field_width = 0;
    int field_precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* step over the '%' and read e.g. "%2.5s" => width=2, precision=5 */
    ++f;
    for (; Py_ISDIGIT((unsigned)*f); ++f)
        field_width = (field_width*10) + *f - '0';

    if (*f == '.') {
        ++f;
        for (; Py_ISDIGIT((unsigned)*f); ++f)
            field_precision = (field_precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            --f;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        --f;
    }

    if (p_width != NULL)
        *p_width = field_width;
    if (p_precision != NULL)
        *p_precision = field_precision;

    /* length modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_long = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            is_longlong = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_size_t = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
2239
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002240/* maximum number of characters required for output of %ld. 21 characters
2241 allows for 64-bit integers (in decimal) and an optional sign. */
2242#define MAX_LONG_CHARS 21
2243/* maximum number of characters required for output of %lld.
2244 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2245 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2246#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2247
Walter Dörwaldd2034312007-05-18 16:29:38 +00002248PyObject *
2249PyUnicode_FromFormatV(const char *format, va_list vargs)
2250{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002251 va_list count;
2252 Py_ssize_t callcount = 0;
2253 PyObject **callresults = NULL;
2254 PyObject **callresult = NULL;
2255 Py_ssize_t n = 0;
2256 int width = 0;
2257 int precision = 0;
2258 int zeropad;
2259 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002260 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002261 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002262 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2264 Py_UCS4 argmaxchar;
2265 Py_ssize_t numbersize = 0;
2266 char *numberresults = NULL;
2267 char *numberresult = NULL;
2268 Py_ssize_t i;
2269 int kind;
2270 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002271
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002272 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002273 /* step 1: count the number of %S/%R/%A/%s format specifications
2274 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2275 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002277 * also estimate a upper bound for all the number formats in the string,
2278 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002279 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002280 for (f = format; *f; f++) {
2281 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002282 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002283 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2284 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2285 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2286 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002288 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002289#ifdef HAVE_LONG_LONG
2290 if (longlongflag) {
2291 if (width < MAX_LONG_LONG_CHARS)
2292 width = MAX_LONG_LONG_CHARS;
2293 }
2294 else
2295#endif
2296 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2297 including sign. Decimal takes the most space. This
2298 isn't enough for octal. If a width is specified we
2299 need more (which we allocate later). */
2300 if (width < MAX_LONG_CHARS)
2301 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002302
2303 /* account for the size + '\0' to separate numbers
2304 inside of the numberresults buffer */
2305 numbersize += (width + 1);
2306 }
2307 }
2308 else if ((unsigned char)*f > 127) {
2309 PyErr_Format(PyExc_ValueError,
2310 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2311 "string, got a non-ASCII byte: 0x%02x",
2312 (unsigned char)*f);
2313 return NULL;
2314 }
2315 }
2316 /* step 2: allocate memory for the results of
2317 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2318 if (callcount) {
2319 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2320 if (!callresults) {
2321 PyErr_NoMemory();
2322 return NULL;
2323 }
2324 callresult = callresults;
2325 }
2326 /* step 2.5: allocate memory for the results of formating numbers */
2327 if (numbersize) {
2328 numberresults = PyObject_Malloc(numbersize);
2329 if (!numberresults) {
2330 PyErr_NoMemory();
2331 goto fail;
2332 }
2333 numberresult = numberresults;
2334 }
2335
2336 /* step 3: format numbers and figure out how large a buffer we need */
2337 for (f = format; *f; f++) {
2338 if (*f == '%') {
2339 const char* p;
2340 int longflag;
2341 int longlongflag;
2342 int size_tflag;
2343 int numprinted;
2344
2345 p = f;
2346 zeropad = (f[1] == '0');
2347 f = parse_format_flags(f, &width, &precision,
2348 &longflag, &longlongflag, &size_tflag);
2349 switch (*f) {
2350 case 'c':
2351 {
2352 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002353 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002354 n++;
2355 break;
2356 }
2357 case '%':
2358 n++;
2359 break;
2360 case 'i':
2361 case 'd':
2362 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2363 width, precision, *f);
2364 if (longflag)
2365 numprinted = sprintf(numberresult, fmt,
2366 va_arg(count, long));
2367#ifdef HAVE_LONG_LONG
2368 else if (longlongflag)
2369 numprinted = sprintf(numberresult, fmt,
2370 va_arg(count, PY_LONG_LONG));
2371#endif
2372 else if (size_tflag)
2373 numprinted = sprintf(numberresult, fmt,
2374 va_arg(count, Py_ssize_t));
2375 else
2376 numprinted = sprintf(numberresult, fmt,
2377 va_arg(count, int));
2378 n += numprinted;
2379 /* advance by +1 to skip over the '\0' */
2380 numberresult += (numprinted + 1);
2381 assert(*(numberresult - 1) == '\0');
2382 assert(*(numberresult - 2) != '\0');
2383 assert(numprinted >= 0);
2384 assert(numberresult <= numberresults + numbersize);
2385 break;
2386 case 'u':
2387 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2388 width, precision, 'u');
2389 if (longflag)
2390 numprinted = sprintf(numberresult, fmt,
2391 va_arg(count, unsigned long));
2392#ifdef HAVE_LONG_LONG
2393 else if (longlongflag)
2394 numprinted = sprintf(numberresult, fmt,
2395 va_arg(count, unsigned PY_LONG_LONG));
2396#endif
2397 else if (size_tflag)
2398 numprinted = sprintf(numberresult, fmt,
2399 va_arg(count, size_t));
2400 else
2401 numprinted = sprintf(numberresult, fmt,
2402 va_arg(count, unsigned int));
2403 n += numprinted;
2404 numberresult += (numprinted + 1);
2405 assert(*(numberresult - 1) == '\0');
2406 assert(*(numberresult - 2) != '\0');
2407 assert(numprinted >= 0);
2408 assert(numberresult <= numberresults + numbersize);
2409 break;
2410 case 'x':
2411 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2412 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2413 n += numprinted;
2414 numberresult += (numprinted + 1);
2415 assert(*(numberresult - 1) == '\0');
2416 assert(*(numberresult - 2) != '\0');
2417 assert(numprinted >= 0);
2418 assert(numberresult <= numberresults + numbersize);
2419 break;
2420 case 'p':
2421 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2422 /* %p is ill-defined: ensure leading 0x. */
2423 if (numberresult[1] == 'X')
2424 numberresult[1] = 'x';
2425 else if (numberresult[1] != 'x') {
2426 memmove(numberresult + 2, numberresult,
2427 strlen(numberresult) + 1);
2428 numberresult[0] = '0';
2429 numberresult[1] = 'x';
2430 numprinted += 2;
2431 }
2432 n += numprinted;
2433 numberresult += (numprinted + 1);
2434 assert(*(numberresult - 1) == '\0');
2435 assert(*(numberresult - 2) != '\0');
2436 assert(numprinted >= 0);
2437 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002438 break;
2439 case 's':
2440 {
2441 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002442 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002443 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2444 if (!str)
2445 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002446 /* since PyUnicode_DecodeUTF8 returns already flexible
2447 unicode objects, there is no need to call ready on them */
2448 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002449 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002451 /* Remember the str and switch to the next slot */
2452 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002453 break;
2454 }
2455 case 'U':
2456 {
2457 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002458 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 if (PyUnicode_READY(obj) == -1)
2460 goto fail;
2461 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002462 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002463 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002464 break;
2465 }
2466 case 'V':
2467 {
2468 PyObject *obj = va_arg(count, PyObject *);
2469 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002470 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002471 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002472 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002473 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002474 if (PyUnicode_READY(obj) == -1)
2475 goto fail;
2476 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002477 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002478 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002479 *callresult++ = NULL;
2480 }
2481 else {
2482 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2483 if (!str_obj)
2484 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002485 if (PyUnicode_READY(str_obj)) {
2486 Py_DECREF(str_obj);
2487 goto fail;
2488 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002489 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002490 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002491 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002492 *callresult++ = str_obj;
2493 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002494 break;
2495 }
2496 case 'S':
2497 {
2498 PyObject *obj = va_arg(count, PyObject *);
2499 PyObject *str;
2500 assert(obj);
2501 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002502 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002503 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002505 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002506 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002507 /* Remember the str and switch to the next slot */
2508 *callresult++ = str;
2509 break;
2510 }
2511 case 'R':
2512 {
2513 PyObject *obj = va_arg(count, PyObject *);
2514 PyObject *repr;
2515 assert(obj);
2516 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002517 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002518 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002519 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002520 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002521 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002522 /* Remember the repr and switch to the next slot */
2523 *callresult++ = repr;
2524 break;
2525 }
2526 case 'A':
2527 {
2528 PyObject *obj = va_arg(count, PyObject *);
2529 PyObject *ascii;
2530 assert(obj);
2531 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002532 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002533 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002534 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002535 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002536 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002537 /* Remember the repr and switch to the next slot */
2538 *callresult++ = ascii;
2539 break;
2540 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002541 default:
2542 /* if we stumble upon an unknown
2543 formatting code, copy the rest of
2544 the format string to the output
2545 string. (we cannot just skip the
2546 code, since there's no way to know
2547 what's in the argument list) */
2548 n += strlen(p);
2549 goto expand;
2550 }
2551 } else
2552 n++;
2553 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002554 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002555 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002557 we don't have to resize the string.
2558 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002559 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002560 if (!string)
2561 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002562 kind = PyUnicode_KIND(string);
2563 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002564 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002565 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002566
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002567 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002568 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002569 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002570
2571 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002572 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2573 /* checking for == because the last argument could be a empty
2574 string, which causes i to point to end, the assert at the end of
2575 the loop */
2576 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002577
Benjamin Peterson14339b62009-01-31 16:36:08 +00002578 switch (*f) {
2579 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002580 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002581 const int ordinal = va_arg(vargs, int);
2582 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002583 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002584 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002585 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002586 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002587 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002588 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002589 case 'p':
2590 /* unused, since we already have the result */
2591 if (*f == 'p')
2592 (void) va_arg(vargs, void *);
2593 else
2594 (void) va_arg(vargs, int);
2595 /* extract the result from numberresults and append. */
2596 for (; *numberresult; ++i, ++numberresult)
2597 PyUnicode_WRITE(kind, data, i, *numberresult);
2598 /* skip over the separating '\0' */
2599 assert(*numberresult == '\0');
2600 numberresult++;
2601 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002602 break;
2603 case 's':
2604 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002605 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002606 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002607 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 size = PyUnicode_GET_LENGTH(*callresult);
2609 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002610 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002611 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002612 /* We're done with the unicode()/repr() => forget it */
2613 Py_DECREF(*callresult);
2614 /* switch to next unicode()/repr() result */
2615 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002616 break;
2617 }
2618 case 'U':
2619 {
2620 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002621 Py_ssize_t size;
2622 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2623 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002624 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002625 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002626 break;
2627 }
2628 case 'V':
2629 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002630 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002631 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002632 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002633 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002634 size = PyUnicode_GET_LENGTH(obj);
2635 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002636 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002638 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002639 size = PyUnicode_GET_LENGTH(*callresult);
2640 assert(PyUnicode_KIND(*callresult) <=
2641 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002642 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002643 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002644 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002645 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002646 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002647 break;
2648 }
2649 case 'S':
2650 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002651 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002652 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002653 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002654 /* unused, since we already have the result */
2655 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002656 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002657 copy_characters(string, i, *callresult, 0, size);
2658 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002659 /* We're done with the unicode()/repr() => forget it */
2660 Py_DECREF(*callresult);
2661 /* switch to next unicode()/repr() result */
2662 ++callresult;
2663 break;
2664 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002665 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002666 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002667 break;
2668 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669 for (; *p; ++p, ++i)
2670 PyUnicode_WRITE(kind, data, i, *p);
2671 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002672 goto end;
2673 }
Victor Stinner1205f272010-09-11 00:54:47 +00002674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002675 else {
2676 assert(i < PyUnicode_GET_LENGTH(string));
2677 PyUnicode_WRITE(kind, data, i++, *f);
2678 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002680 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002681
Benjamin Peterson29060642009-01-31 22:14:21 +00002682 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002683 if (callresults)
2684 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002685 if (numberresults)
2686 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002687 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002688 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002689 if (callresults) {
2690 PyObject **callresult2 = callresults;
2691 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002692 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002693 ++callresult2;
2694 }
2695 PyObject_Free(callresults);
2696 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002697 if (numberresults)
2698 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002699 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002700}
2701
Walter Dörwaldd2034312007-05-18 16:29:38 +00002702PyObject *
2703PyUnicode_FromFormat(const char *format, ...)
2704{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002705 PyObject* ret;
2706 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002707
2708#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002709 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002710#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002711 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002712#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002713 ret = PyUnicode_FromFormatV(format, vargs);
2714 va_end(vargs);
2715 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002716}
2717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002718#ifdef HAVE_WCHAR_H
2719
Victor Stinner5593d8a2010-10-02 11:11:27 +00002720/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2721 convert a Unicode object to a wide character string.
2722
Victor Stinnerd88d9832011-09-06 02:00:05 +02002723 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002724 character) required to convert the unicode object. Ignore size argument.
2725
Victor Stinnerd88d9832011-09-06 02:00:05 +02002726 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002727 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002728 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002729static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002730unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002731 wchar_t *w,
2732 Py_ssize_t size)
2733{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002734 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002735 const wchar_t *wstr;
2736
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002737 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002738 if (wstr == NULL)
2739 return -1;
2740
Victor Stinner5593d8a2010-10-02 11:11:27 +00002741 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002742 if (size > res)
2743 size = res + 1;
2744 else
2745 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002746 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002747 return res;
2748 }
2749 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002750 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002751}
2752
2753Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002754PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002755 wchar_t *w,
2756 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757{
2758 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002759 PyErr_BadInternalCall();
2760 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002762 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763}
2764
Victor Stinner137c34c2010-09-29 10:25:54 +00002765wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002766PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002767 Py_ssize_t *size)
2768{
2769 wchar_t* buffer;
2770 Py_ssize_t buflen;
2771
2772 if (unicode == NULL) {
2773 PyErr_BadInternalCall();
2774 return NULL;
2775 }
2776
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002777 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002778 if (buflen == -1)
2779 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002780 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002781 PyErr_NoMemory();
2782 return NULL;
2783 }
2784
Victor Stinner137c34c2010-09-29 10:25:54 +00002785 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2786 if (buffer == NULL) {
2787 PyErr_NoMemory();
2788 return NULL;
2789 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002790 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002791 if (buflen == -1)
2792 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002793 if (size != NULL)
2794 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002795 return buffer;
2796}
2797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002798#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799
Alexander Belopolsky40018472011-02-26 01:02:56 +00002800PyObject *
2801PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002802{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002803 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002804 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002805 PyErr_SetString(PyExc_ValueError,
2806 "chr() arg not in range(0x110000)");
2807 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002808 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002810 if (ordinal < 256)
2811 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002813 v = PyUnicode_New(1, ordinal);
2814 if (v == NULL)
2815 return NULL;
2816 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002817 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002818 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002819}
2820
Alexander Belopolsky40018472011-02-26 01:02:56 +00002821PyObject *
2822PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002823{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002824 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002825 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002826 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002827 if (PyUnicode_READY(obj))
2828 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002829 Py_INCREF(obj);
2830 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002831 }
2832 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002833 /* For a Unicode subtype that's not a Unicode object,
2834 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002835 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002836 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002837 PyErr_Format(PyExc_TypeError,
2838 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002839 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002840 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002841}
2842
Alexander Belopolsky40018472011-02-26 01:02:56 +00002843PyObject *
2844PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002845 const char *encoding,
2846 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002847{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002848 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002849 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002850
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002852 PyErr_BadInternalCall();
2853 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002855
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002856 /* Decoding bytes objects is the most common case and should be fast */
2857 if (PyBytes_Check(obj)) {
2858 if (PyBytes_GET_SIZE(obj) == 0) {
2859 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002860 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002861 }
2862 else {
2863 v = PyUnicode_Decode(
2864 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2865 encoding, errors);
2866 }
2867 return v;
2868 }
2869
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002870 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002871 PyErr_SetString(PyExc_TypeError,
2872 "decoding str is not supported");
2873 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002874 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002875
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002876 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2877 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2878 PyErr_Format(PyExc_TypeError,
2879 "coercing to str: need bytes, bytearray "
2880 "or buffer-like object, %.80s found",
2881 Py_TYPE(obj)->tp_name);
2882 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002883 }
Tim Petersced69f82003-09-16 20:30:58 +00002884
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002885 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002886 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002887 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002888 }
Tim Petersced69f82003-09-16 20:30:58 +00002889 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002890 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002891
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002892 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002893 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894}
2895
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  The NUL-terminated result is written to lower, a
   buffer of lower_len bytes.  A NULL encoding is treated as "utf-8".

   Return 0 on error (the result needs more than lower_len-1 characters,
   including the case lower_len == 0), 1 on success.  On error the
   contents of lower are unspecified and not NUL-terminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* Without room for the terminator nothing can be stored; checking here
       also keeps lower_len - 1 below from wrapping around. */
    if (lower_len == 0)
        return 0;

    /* Send the default through the same bounded loop as every other name
       instead of copying it with an unchecked strcpy(). */
    if (encoding == NULL)
        encoding = "utf-8";

    l = lower;
    l_end = &lower[lower_len - 1];   /* slot reserved for the '\0' */
    for (e = encoding; *e != '\0'; e++) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e))
            *l++ = Py_TOLOWER(*e);
        else if (*e == '_')
            *l++ = '-';
        else
            *l++ = *e;
    }
    *l = '\0';
    return 1;
}
2932
Alexander Belopolsky40018472011-02-26 01:02:56 +00002933PyObject *
2934PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002935 Py_ssize_t size,
2936 const char *encoding,
2937 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002938{
2939 PyObject *buffer = NULL, *unicode;
2940 Py_buffer info;
2941 char lower[11]; /* Enough for any encoding shortcut */
2942
Fred Drakee4315f52000-05-09 19:53:39 +00002943 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002944 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002945 if ((strcmp(lower, "utf-8") == 0) ||
2946 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002947 return PyUnicode_DecodeUTF8(s, size, errors);
2948 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002949 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002950 (strcmp(lower, "iso-8859-1") == 0))
2951 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002952#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002953 else if (strcmp(lower, "mbcs") == 0)
2954 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002955#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002956 else if (strcmp(lower, "ascii") == 0)
2957 return PyUnicode_DecodeASCII(s, size, errors);
2958 else if (strcmp(lower, "utf-16") == 0)
2959 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2960 else if (strcmp(lower, "utf-32") == 0)
2961 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2962 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963
2964 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002965 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002966 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002967 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002968 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002969 if (buffer == NULL)
2970 goto onError;
2971 unicode = PyCodec_Decode(buffer, encoding, errors);
2972 if (unicode == NULL)
2973 goto onError;
2974 if (!PyUnicode_Check(unicode)) {
2975 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002976 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002977 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978 Py_DECREF(unicode);
2979 goto onError;
2980 }
2981 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002982 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002983
Benjamin Peterson29060642009-01-31 22:14:21 +00002984 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985 Py_XDECREF(buffer);
2986 return NULL;
2987}
2988
Alexander Belopolsky40018472011-02-26 01:02:56 +00002989PyObject *
2990PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002991 const char *encoding,
2992 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002993{
2994 PyObject *v;
2995
2996 if (!PyUnicode_Check(unicode)) {
2997 PyErr_BadArgument();
2998 goto onError;
2999 }
3000
3001 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003002 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003003
3004 /* Decode via the codec registry */
3005 v = PyCodec_Decode(unicode, encoding, errors);
3006 if (v == NULL)
3007 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003008 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003009
Benjamin Peterson29060642009-01-31 22:14:21 +00003010 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003011 return NULL;
3012}
3013
Alexander Belopolsky40018472011-02-26 01:02:56 +00003014PyObject *
3015PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003016 const char *encoding,
3017 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003018{
3019 PyObject *v;
3020
3021 if (!PyUnicode_Check(unicode)) {
3022 PyErr_BadArgument();
3023 goto onError;
3024 }
3025
3026 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003027 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003028
3029 /* Decode via the codec registry */
3030 v = PyCodec_Decode(unicode, encoding, errors);
3031 if (v == NULL)
3032 goto onError;
3033 if (!PyUnicode_Check(v)) {
3034 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003035 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003036 Py_TYPE(v)->tp_name);
3037 Py_DECREF(v);
3038 goto onError;
3039 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003040 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003041
Benjamin Peterson29060642009-01-31 22:14:21 +00003042 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003043 return NULL;
3044}
3045
Alexander Belopolsky40018472011-02-26 01:02:56 +00003046PyObject *
3047PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003048 Py_ssize_t size,
3049 const char *encoding,
3050 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051{
3052 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003053
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054 unicode = PyUnicode_FromUnicode(s, size);
3055 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003056 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3058 Py_DECREF(unicode);
3059 return v;
3060}
3061
Alexander Belopolsky40018472011-02-26 01:02:56 +00003062PyObject *
3063PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003064 const char *encoding,
3065 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003066{
3067 PyObject *v;
3068
3069 if (!PyUnicode_Check(unicode)) {
3070 PyErr_BadArgument();
3071 goto onError;
3072 }
3073
3074 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003075 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003076
3077 /* Encode via the codec registry */
3078 v = PyCodec_Encode(unicode, encoding, errors);
3079 if (v == NULL)
3080 goto onError;
3081 return v;
3082
Benjamin Peterson29060642009-01-31 22:14:21 +00003083 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003084 return NULL;
3085}
3086
Victor Stinnerad158722010-10-27 00:25:46 +00003087PyObject *
3088PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003089{
Victor Stinner99b95382011-07-04 14:23:54 +02003090#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003091 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003092#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003093 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003094#else
Victor Stinner793b5312011-04-27 00:24:21 +02003095 PyInterpreterState *interp = PyThreadState_GET()->interp;
3096 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3097 cannot use it to encode and decode filenames before it is loaded. Load
3098 the Python codec requires to encode at least its own filename. Use the C
3099 version of the locale codec until the codec registry is initialized and
3100 the Python codec is loaded.
3101
3102 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3103 cannot only rely on it: check also interp->fscodec_initialized for
3104 subinterpreters. */
3105 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003106 return PyUnicode_AsEncodedString(unicode,
3107 Py_FileSystemDefaultEncoding,
3108 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003109 }
3110 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003111 /* locale encoding with surrogateescape */
3112 wchar_t *wchar;
3113 char *bytes;
3114 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003115 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003116
3117 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3118 if (wchar == NULL)
3119 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003120 bytes = _Py_wchar2char(wchar, &error_pos);
3121 if (bytes == NULL) {
3122 if (error_pos != (size_t)-1) {
3123 char *errmsg = strerror(errno);
3124 PyObject *exc = NULL;
3125 if (errmsg == NULL)
3126 errmsg = "Py_wchar2char() failed";
3127 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003128 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003129 error_pos, error_pos+1,
3130 errmsg);
3131 Py_XDECREF(exc);
3132 }
3133 else
3134 PyErr_NoMemory();
3135 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003136 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003137 }
3138 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003139
3140 bytes_obj = PyBytes_FromString(bytes);
3141 PyMem_Free(bytes);
3142 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003143 }
Victor Stinnerad158722010-10-27 00:25:46 +00003144#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003145}
3146
Alexander Belopolsky40018472011-02-26 01:02:56 +00003147PyObject *
3148PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003149 const char *encoding,
3150 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151{
3152 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003153 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003154
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155 if (!PyUnicode_Check(unicode)) {
3156 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003157 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158 }
Fred Drakee4315f52000-05-09 19:53:39 +00003159
Fred Drakee4315f52000-05-09 19:53:39 +00003160 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003161 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003162 if ((strcmp(lower, "utf-8") == 0) ||
3163 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003164 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003165 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003166 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003167 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003168 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003169 }
Victor Stinner37296e82010-06-10 13:36:23 +00003170 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003171 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003172 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003173 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003174#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003175 else if (strcmp(lower, "mbcs") == 0)
3176 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003177#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003178 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003179 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003180 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003181
3182 /* Encode via the codec registry */
3183 v = PyCodec_Encode(unicode, encoding, errors);
3184 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003185 return NULL;
3186
3187 /* The normal path */
3188 if (PyBytes_Check(v))
3189 return v;
3190
3191 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003192 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003193 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003194 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003195
3196 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3197 "encoder %s returned bytearray instead of bytes",
3198 encoding);
3199 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003200 Py_DECREF(v);
3201 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003202 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003203
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003204 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3205 Py_DECREF(v);
3206 return b;
3207 }
3208
3209 PyErr_Format(PyExc_TypeError,
3210 "encoder did not return a bytes object (type=%.400s)",
3211 Py_TYPE(v)->tp_name);
3212 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003213 return NULL;
3214}
3215
Alexander Belopolsky40018472011-02-26 01:02:56 +00003216PyObject *
3217PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003218 const char *encoding,
3219 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003220{
3221 PyObject *v;
3222
3223 if (!PyUnicode_Check(unicode)) {
3224 PyErr_BadArgument();
3225 goto onError;
3226 }
3227
3228 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003229 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003230
3231 /* Encode via the codec registry */
3232 v = PyCodec_Encode(unicode, encoding, errors);
3233 if (v == NULL)
3234 goto onError;
3235 if (!PyUnicode_Check(v)) {
3236 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003237 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003238 Py_TYPE(v)->tp_name);
3239 Py_DECREF(v);
3240 goto onError;
3241 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003243
Benjamin Peterson29060642009-01-31 22:14:21 +00003244 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 return NULL;
3246}
3247
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003248PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003249PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003250 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003251 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3252}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003253
Christian Heimes5894ba72007-11-04 11:43:14 +00003254PyObject*
3255PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3256{
Victor Stinner99b95382011-07-04 14:23:54 +02003257#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003258 return PyUnicode_DecodeMBCS(s, size, NULL);
3259#elif defined(__APPLE__)
3260 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3261#else
Victor Stinner793b5312011-04-27 00:24:21 +02003262 PyInterpreterState *interp = PyThreadState_GET()->interp;
3263 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3264 cannot use it to encode and decode filenames before it is loaded. Load
3265 the Python codec requires to encode at least its own filename. Use the C
3266 version of the locale codec until the codec registry is initialized and
3267 the Python codec is loaded.
3268
3269 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3270 cannot only rely on it: check also interp->fscodec_initialized for
3271 subinterpreters. */
3272 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003273 return PyUnicode_Decode(s, size,
3274 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003275 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003276 }
3277 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003278 /* locale encoding with surrogateescape */
3279 wchar_t *wchar;
3280 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003281 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003282
3283 if (s[size] != '\0' || size != strlen(s)) {
3284 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3285 return NULL;
3286 }
3287
Victor Stinner168e1172010-10-16 23:16:16 +00003288 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003289 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003290 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003291
Victor Stinner168e1172010-10-16 23:16:16 +00003292 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003293 PyMem_Free(wchar);
3294 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003295 }
Victor Stinnerad158722010-10-27 00:25:46 +00003296#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003297}
3298
Martin v. Löwis011e8422009-05-05 04:43:17 +00003299
3300int
3301PyUnicode_FSConverter(PyObject* arg, void* addr)
3302{
3303 PyObject *output = NULL;
3304 Py_ssize_t size;
3305 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003306 if (arg == NULL) {
3307 Py_DECREF(*(PyObject**)addr);
3308 return 1;
3309 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003310 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003311 output = arg;
3312 Py_INCREF(output);
3313 }
3314 else {
3315 arg = PyUnicode_FromObject(arg);
3316 if (!arg)
3317 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003318 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003319 Py_DECREF(arg);
3320 if (!output)
3321 return 0;
3322 if (!PyBytes_Check(output)) {
3323 Py_DECREF(output);
3324 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3325 return 0;
3326 }
3327 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003328 size = PyBytes_GET_SIZE(output);
3329 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003330 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003331 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003332 Py_DECREF(output);
3333 return 0;
3334 }
3335 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003336 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003337}
3338
3339
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003340int
3341PyUnicode_FSDecoder(PyObject* arg, void* addr)
3342{
3343 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003344 if (arg == NULL) {
3345 Py_DECREF(*(PyObject**)addr);
3346 return 1;
3347 }
3348 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003349 if (PyUnicode_READY(arg))
3350 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003351 output = arg;
3352 Py_INCREF(output);
3353 }
3354 else {
3355 arg = PyBytes_FromObject(arg);
3356 if (!arg)
3357 return 0;
3358 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3359 PyBytes_GET_SIZE(arg));
3360 Py_DECREF(arg);
3361 if (!output)
3362 return 0;
3363 if (!PyUnicode_Check(output)) {
3364 Py_DECREF(output);
3365 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3366 return 0;
3367 }
3368 }
Victor Stinner065836e2011-10-27 01:56:33 +02003369 if (PyUnicode_READY(output) < 0) {
3370 Py_DECREF(output);
3371 return 0;
3372 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003373 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003374 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003375 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3376 Py_DECREF(output);
3377 return 0;
3378 }
3379 *(PyObject**)addr = output;
3380 return Py_CLEANUP_SUPPORTED;
3381}
3382
3383
Martin v. Löwis5b222132007-06-10 09:51:05 +00003384char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003385PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003386{
Christian Heimesf3863112007-11-22 07:46:41 +00003387 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003388
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003389 if (!PyUnicode_Check(unicode)) {
3390 PyErr_BadArgument();
3391 return NULL;
3392 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003393 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003394 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003395
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003396 if (PyUnicode_UTF8(unicode) == NULL) {
3397 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003398 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3399 if (bytes == NULL)
3400 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003401 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3402 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003403 Py_DECREF(bytes);
3404 return NULL;
3405 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003406 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3407 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3408 PyBytes_AS_STRING(bytes),
3409 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003410 Py_DECREF(bytes);
3411 }
3412
3413 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003414 *psize = PyUnicode_UTF8_LENGTH(unicode);
3415 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003416}
3417
3418char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003419PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003420{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003421 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3422}
3423
3424#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003425static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003426#endif
3427
3428
3429Py_UNICODE *
3430PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3431{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003432 const unsigned char *one_byte;
3433#if SIZEOF_WCHAR_T == 4
3434 const Py_UCS2 *two_bytes;
3435#else
3436 const Py_UCS4 *four_bytes;
3437 const Py_UCS4 *ucs4_end;
3438 Py_ssize_t num_surrogates;
3439#endif
3440 wchar_t *w;
3441 wchar_t *wchar_end;
3442
3443 if (!PyUnicode_Check(unicode)) {
3444 PyErr_BadArgument();
3445 return NULL;
3446 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003447 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003448 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003449 assert(_PyUnicode_KIND(unicode) != 0);
3450 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003451
3452#ifdef Py_DEBUG
3453 ++unicode_as_unicode_calls;
3454#endif
3455
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003456 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003457#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003458 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3459 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003460 num_surrogates = 0;
3461
3462 for (; four_bytes < ucs4_end; ++four_bytes) {
3463 if (*four_bytes > 0xFFFF)
3464 ++num_surrogates;
3465 }
3466
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003467 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3468 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3469 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003470 PyErr_NoMemory();
3471 return NULL;
3472 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003473 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003474
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003475 w = _PyUnicode_WSTR(unicode);
3476 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3477 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003478 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3479 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003480 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003481 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003482 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3483 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003484 }
3485 else
3486 *w = *four_bytes;
3487
3488 if (w > wchar_end) {
3489 assert(0 && "Miscalculated string end");
3490 }
3491 }
3492 *w = 0;
3493#else
3494 /* sizeof(wchar_t) == 4 */
3495 Py_FatalError("Impossible unicode object state, wstr and str "
3496 "should share memory already.");
3497 return NULL;
3498#endif
3499 }
3500 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003501 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3502 (_PyUnicode_LENGTH(unicode) + 1));
3503 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003504 PyErr_NoMemory();
3505 return NULL;
3506 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003507 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3508 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3509 w = _PyUnicode_WSTR(unicode);
3510 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003511
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003512 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3513 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003514 for (; w < wchar_end; ++one_byte, ++w)
3515 *w = *one_byte;
3516 /* null-terminate the wstr */
3517 *w = 0;
3518 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003519 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003520#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003521 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003522 for (; w < wchar_end; ++two_bytes, ++w)
3523 *w = *two_bytes;
3524 /* null-terminate the wstr */
3525 *w = 0;
3526#else
3527 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003528 PyObject_FREE(_PyUnicode_WSTR(unicode));
3529 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003530 Py_FatalError("Impossible unicode object state, wstr "
3531 "and str should share memory already.");
3532 return NULL;
3533#endif
3534 }
3535 else {
3536 assert(0 && "This should never happen.");
3537 }
3538 }
3539 }
3540 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003541 *size = PyUnicode_WSTR_LENGTH(unicode);
3542 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003543}
3544
Alexander Belopolsky40018472011-02-26 01:02:56 +00003545Py_UNICODE *
3546PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003547{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003548 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003549}
3550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003551
Alexander Belopolsky40018472011-02-26 01:02:56 +00003552Py_ssize_t
3553PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554{
3555 if (!PyUnicode_Check(unicode)) {
3556 PyErr_BadArgument();
3557 goto onError;
3558 }
3559 return PyUnicode_GET_SIZE(unicode);
3560
Benjamin Peterson29060642009-01-31 22:14:21 +00003561 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003562 return -1;
3563}
3564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003565Py_ssize_t
3566PyUnicode_GetLength(PyObject *unicode)
3567{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003568 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003569 PyErr_BadArgument();
3570 return -1;
3571 }
3572
3573 return PyUnicode_GET_LENGTH(unicode);
3574}
3575
3576Py_UCS4
3577PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3578{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003579 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3580 PyErr_BadArgument();
3581 return (Py_UCS4)-1;
3582 }
3583 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3584 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003585 return (Py_UCS4)-1;
3586 }
3587 return PyUnicode_READ_CHAR(unicode, index);
3588}
3589
3590int
3591PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3592{
3593 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003594 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003595 return -1;
3596 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003597 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3598 PyErr_SetString(PyExc_IndexError, "string index out of range");
3599 return -1;
3600 }
3601 if (_PyUnicode_Dirty(unicode))
3602 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003603 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3604 index, ch);
3605 return 0;
3606}
3607
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3613
Victor Stinner554f3f02010-06-16 23:33:54 +00003614/* create or adjust a UnicodeDecodeError */
3615static void
3616make_decode_exception(PyObject **exceptionObject,
3617 const char *encoding,
3618 const char *input, Py_ssize_t length,
3619 Py_ssize_t startpos, Py_ssize_t endpos,
3620 const char *reason)
3621{
3622 if (*exceptionObject == NULL) {
3623 *exceptionObject = PyUnicodeDecodeError_Create(
3624 encoding, input, length, startpos, endpos, reason);
3625 }
3626 else {
3627 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3628 goto onError;
3629 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3630 goto onError;
3631 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3632 goto onError;
3633 }
3634 return;
3635
3636onError:
3637 Py_DECREF(*exceptionObject);
3638 *exceptionObject = NULL;
3639}
3640
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003641/* error handling callback helper:
3642 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003643 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003644 and adjust various state variables.
3645 return 0 on success, -1 on error
3646*/
3647
Alexander Belopolsky40018472011-02-26 01:02:56 +00003648static int
3649unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003650 const char *encoding, const char *reason,
3651 const char **input, const char **inend, Py_ssize_t *startinpos,
3652 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003653 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003654{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003655 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003656
3657 PyObject *restuple = NULL;
3658 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003659 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003660 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003661 Py_ssize_t requiredsize;
3662 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003663 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003664 int res = -1;
3665
Victor Stinner596a6c42011-11-09 00:02:18 +01003666 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3667 outsize = PyUnicode_GET_LENGTH(*output);
3668 else
3669 outsize = _PyUnicode_WSTR_LENGTH(*output);
3670
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003671 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003672 *errorHandler = PyCodec_LookupError(errors);
3673 if (*errorHandler == NULL)
3674 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 }
3676
Victor Stinner554f3f02010-06-16 23:33:54 +00003677 make_decode_exception(exceptionObject,
3678 encoding,
3679 *input, *inend - *input,
3680 *startinpos, *endinpos,
3681 reason);
3682 if (*exceptionObject == NULL)
3683 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003684
3685 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3686 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003687 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003688 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003689 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003690 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003691 }
3692 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003693 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003694 if (PyUnicode_READY(repunicode) < 0)
3695 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003696
3697 /* Copy back the bytes variables, which might have been modified by the
3698 callback */
3699 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3700 if (!inputobj)
3701 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003702 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003703 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003704 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003705 *input = PyBytes_AS_STRING(inputobj);
3706 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003707 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003708 /* we can DECREF safely, as the exception has another reference,
3709 so the object won't go away. */
3710 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003711
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003712 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003713 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003714 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003715 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3716 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003717 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003718
Victor Stinner596a6c42011-11-09 00:02:18 +01003719 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3720 /* need more space? (at least enough for what we
3721 have+the replacement+the rest of the string (starting
3722 at the new input position), so we won't have to check space
3723 when there are no errors in the rest of the string) */
3724 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3725 requiredsize = *outpos + replen + insize-newpos;
3726 if (requiredsize > outsize) {
3727 if (requiredsize<2*outsize)
3728 requiredsize = 2*outsize;
3729 if (unicode_resize(output, requiredsize) < 0)
3730 goto onError;
3731 }
3732 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003733 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003734 copy_characters(*output, *outpos, repunicode, 0, replen);
3735 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003736 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003737 else {
3738 wchar_t *repwstr;
3739 Py_ssize_t repwlen;
3740 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3741 if (repwstr == NULL)
3742 goto onError;
3743 /* need more space? (at least enough for what we
3744 have+the replacement+the rest of the string (starting
3745 at the new input position), so we won't have to check space
3746 when there are no errors in the rest of the string) */
3747 requiredsize = *outpos + repwlen + insize-newpos;
3748 if (requiredsize > outsize) {
3749 if (requiredsize < 2*outsize)
3750 requiredsize = 2*outsize;
3751 if (unicode_resize(output, requiredsize) < 0)
3752 goto onError;
3753 }
3754 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3755 *outpos += repwlen;
3756 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003757 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003758 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003759
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003760 /* we made it! */
3761 res = 0;
3762
Benjamin Peterson29060642009-01-31 22:14:21 +00003763 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 Py_XDECREF(restuple);
3765 return res;
3766}
3767
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that
 * compound expressions (e.g. "a || b") can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003848
Alexander Belopolsky40018472011-02-26 01:02:56 +00003849PyObject *
3850PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003851 Py_ssize_t size,
3852 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003853{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003854 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3855}
3856
Antoine Pitrou244651a2009-05-04 18:56:13 +00003857/* The decoder. The only state we preserve is our read position,
3858 * i.e. how many characters we have consumed. So if we end in the
3859 * middle of a shift sequence we have to back off the read position
3860 * and the output to the beginning of the sequence, otherwise we lose
3861 * all the shift state (seen bits, number of bits seen, high
3862 * surrogate). */
3863
Alexander Belopolsky40018472011-02-26 01:02:56 +00003864PyObject *
3865PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003866 Py_ssize_t size,
3867 const char *errors,
3868 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003869{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003870 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003871 Py_ssize_t startinpos;
3872 Py_ssize_t endinpos;
3873 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003874 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003875 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003876 const char *errmsg = "";
3877 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003878 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003879 unsigned int base64bits = 0;
3880 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01003881 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003882 PyObject *errorHandler = NULL;
3883 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003884
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003885 /* Start off assuming it's all ASCII. Widen later as necessary. */
3886 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003887 if (!unicode)
3888 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003889 if (size == 0) {
3890 if (consumed)
3891 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003892 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003893 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003894
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003895 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003896 e = s + size;
3897
3898 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003899 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003900 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003901 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003902
Antoine Pitrou244651a2009-05-04 18:56:13 +00003903 if (inShift) { /* in a base-64 section */
3904 if (IS_BASE64(ch)) { /* consume a base-64 character */
3905 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3906 base64bits += 6;
3907 s++;
3908 if (base64bits >= 16) {
3909 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01003910 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00003911 base64bits -= 16;
3912 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3913 if (surrogate) {
3914 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01003915 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
3916 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003917 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3918 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003919 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003920 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003921 }
3922 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003923 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3924 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003925 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003926 }
3927 }
Victor Stinner551ac952011-11-29 22:58:13 +01003928 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003929 /* first surrogate */
3930 surrogate = outCh;
3931 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003932 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003933 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3934 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003935 }
3936 }
3937 }
3938 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003939 inShift = 0;
3940 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003941 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003942 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3943 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003944 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003945 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003946 if (base64bits > 0) { /* left-over bits */
3947 if (base64bits >= 6) {
3948 /* We've seen at least one base-64 character */
3949 errmsg = "partial character in shift sequence";
3950 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003951 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003952 else {
3953 /* Some bits remain; they should be zero */
3954 if (base64buffer != 0) {
3955 errmsg = "non-zero padding bits in shift sequence";
3956 goto utf7Error;
3957 }
3958 }
3959 }
3960 if (ch != '-') {
3961 /* '-' is absorbed; other terminating
3962 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003963 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3964 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003965 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003966 }
3967 }
3968 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003969 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003970 s++; /* consume '+' */
3971 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003972 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003973 if (unicode_putchar(&unicode, &outpos, '+') < 0)
3974 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003975 }
3976 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003977 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003978 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003979 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003980 }
3981 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003982 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003983 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3984 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003985 s++;
3986 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003987 else {
3988 startinpos = s-starts;
3989 s++;
3990 errmsg = "unexpected special character";
3991 goto utf7Error;
3992 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003993 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003994utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003995 endinpos = s-starts;
3996 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003997 errors, &errorHandler,
3998 "utf7", errmsg,
3999 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004000 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004001 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004002 }
4003
Antoine Pitrou244651a2009-05-04 18:56:13 +00004004 /* end of string */
4005
4006 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4007 /* if we're in an inconsistent state, that's an error */
4008 if (surrogate ||
4009 (base64bits >= 6) ||
4010 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004011 endinpos = size;
4012 if (unicode_decode_call_errorhandler(
4013 errors, &errorHandler,
4014 "utf7", "unterminated shift sequence",
4015 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004016 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004017 goto onError;
4018 if (s < e)
4019 goto restart;
4020 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004021 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004022
4023 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004024 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004025 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004026 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004027 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004028 }
4029 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004030 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004031 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004032 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004033
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004034 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004035 goto onError;
4036
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004037 Py_XDECREF(errorHandler);
4038 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004039 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004040
Benjamin Peterson29060642009-01-31 22:14:21 +00004041 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004042 Py_XDECREF(errorHandler);
4043 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004044 Py_DECREF(unicode);
4045 return NULL;
4046}
4047
4048
Alexander Belopolsky40018472011-02-26 01:02:56 +00004049PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004050_PyUnicode_EncodeUTF7(PyObject *str,
4051 int base64SetO,
4052 int base64WhiteSpace,
4053 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004054{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004055 int kind;
4056 void *data;
4057 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004058 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004059 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004060 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004061 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004062 unsigned int base64bits = 0;
4063 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004064 char * out;
4065 char * start;
4066
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004067 if (PyUnicode_READY(str) < 0)
4068 return NULL;
4069 kind = PyUnicode_KIND(str);
4070 data = PyUnicode_DATA(str);
4071 len = PyUnicode_GET_LENGTH(str);
4072
4073 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004074 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004075
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004076 /* It might be possible to tighten this worst case */
4077 allocated = 8 * len;
4078 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004079 return PyErr_NoMemory();
4080
Antoine Pitrou244651a2009-05-04 18:56:13 +00004081 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004082 if (v == NULL)
4083 return NULL;
4084
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004085 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004086 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004087 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004088
Antoine Pitrou244651a2009-05-04 18:56:13 +00004089 if (inShift) {
4090 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4091 /* shifting out */
4092 if (base64bits) { /* output remaining bits */
4093 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4094 base64buffer = 0;
4095 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004096 }
4097 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004098 /* Characters not in the BASE64 set implicitly unshift the sequence
4099 so no '-' is required, except if the character is itself a '-' */
4100 if (IS_BASE64(ch) || ch == '-') {
4101 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004102 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004103 *out++ = (char) ch;
4104 }
4105 else {
4106 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004107 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004108 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004109 else { /* not in a shift sequence */
4110 if (ch == '+') {
4111 *out++ = '+';
4112 *out++ = '-';
4113 }
4114 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4115 *out++ = (char) ch;
4116 }
4117 else {
4118 *out++ = '+';
4119 inShift = 1;
4120 goto encode_char;
4121 }
4122 }
4123 continue;
4124encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004125 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004126 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004127
Antoine Pitrou244651a2009-05-04 18:56:13 +00004128 /* code first surrogate */
4129 base64bits += 16;
4130 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4131 while (base64bits >= 6) {
4132 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4133 base64bits -= 6;
4134 }
4135 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004136 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004137 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004138 base64bits += 16;
4139 base64buffer = (base64buffer << 16) | ch;
4140 while (base64bits >= 6) {
4141 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4142 base64bits -= 6;
4143 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004144 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004145 if (base64bits)
4146 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4147 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004148 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004149 if (_PyBytes_Resize(&v, out - start) < 0)
4150 return NULL;
4151 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004152}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004153PyObject *
4154PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4155 Py_ssize_t size,
4156 int base64SetO,
4157 int base64WhiteSpace,
4158 const char *errors)
4159{
4160 PyObject *result;
4161 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4162 if (tmp == NULL)
4163 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004164 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004165 base64WhiteSpace, errors);
4166 Py_DECREF(tmp);
4167 return result;
4168}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004169
Antoine Pitrou244651a2009-05-04 18:56:13 +00004170#undef IS_BASE64
4171#undef FROM_BASE64
4172#undef TO_BASE64
4173#undef DECODE_DIRECT
4174#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004175
Guido van Rossumd57fd912000-03-10 22:53:23 +00004176/* --- UTF-8 Codec -------------------------------------------------------- */
4177
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.
   The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4199
Alexander Belopolsky40018472011-02-26 01:02:56 +00004200PyObject *
4201PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004202 Py_ssize_t size,
4203 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004204{
Walter Dörwald69652032004-09-07 20:24:22 +00004205 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4206}
4207
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004208#include "stringlib/ucs1lib.h"
4209#include "stringlib/codecs.h"
4210#include "stringlib/undef.h"
4211
4212#include "stringlib/ucs2lib.h"
4213#include "stringlib/codecs.h"
4214#include "stringlib/undef.h"
4215
4216#include "stringlib/ucs4lib.h"
4217#include "stringlib/codecs.h"
4218#include "stringlib/undef.h"
4219
Antoine Pitrouab868312009-01-10 15:40:25 +00004220/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4221#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4222
4223/* Mask to quickly check whether a C 'long' contains a
4224 non-ASCII, UTF8-encoded char. */
4225#if (SIZEOF_LONG == 8)
4226# define ASCII_CHAR_MASK 0x8080808080808080L
4227#elif (SIZEOF_LONG == 4)
4228# define ASCII_CHAR_MASK 0x80808080L
4229#else
4230# error C 'long' size should be either 4 or 8!
4231#endif
4232
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004233/* Scans a UTF-8 string and returns the maximum character to be expected
4234 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004235
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004236 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004237 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238 */
4239static Py_UCS4
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004240utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
4241 Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004242{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004243 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004244 const unsigned char *p = (const unsigned char *)s;
4245 const unsigned char *end = p + string_size;
4246 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004247
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004248 assert(unicode_size != NULL);
4249
4250 /* By having a cascade of independent loops which fallback onto each
4251 other, we minimize the amount of work done in the average loop
4252 iteration, and we also maximize the CPU's ability to predict
4253 branches correctly (because a given condition will have always the
4254 same boolean outcome except perhaps in the last iteration of the
4255 corresponding loop).
4256 In the general case this brings us rather close to decoding
4257 performance pre-PEP 393, despite the two-pass decoding.
4258
4259 Note that the pure ASCII loop is not duplicated once a non-ASCII
4260 character has been encountered. It is actually a pessimization (by
4261 a significant factor) to use this loop on text with many non-ASCII
4262 characters, and it is important to avoid bad performance on valid
4263 utf-8 data (invalid utf-8 being a different can of worms).
4264 */
4265
4266 /* ASCII */
4267 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004268 /* Only check value if it's not a ASCII char... */
4269 if (*p < 0x80) {
4270 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4271 an explanation. */
4272 if (!((size_t) p & LONG_PTR_MASK)) {
4273 /* Help register allocation */
4274 register const unsigned char *_p = p;
4275 while (_p < aligned_end) {
4276 unsigned long value = *(unsigned long *) _p;
4277 if (value & ASCII_CHAR_MASK)
4278 break;
4279 _p += SIZEOF_LONG;
4280 char_count += SIZEOF_LONG;
4281 }
4282 p = _p;
4283 if (p == end)
4284 break;
4285 }
4286 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004287 if (*p < 0x80)
4288 ++char_count;
4289 else
4290 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004291 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004292 *unicode_size = char_count;
4293 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004294
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004295_ucs1loop:
4296 for (; p < end; ++p) {
4297 if (*p < 0xc4)
4298 char_count += ((*p & 0xc0) != 0x80);
4299 else
4300 goto _ucs2loop;
4301 }
4302 *unicode_size = char_count;
4303 return 255;
4304
4305_ucs2loop:
4306 for (; p < end; ++p) {
4307 if (*p < 0xf0)
4308 char_count += ((*p & 0xc0) != 0x80);
4309 else
4310 goto _ucs4loop;
4311 }
4312 *unicode_size = char_count;
4313 return 65535;
4314
4315_ucs4loop:
4316 for (; p < end; ++p) {
4317 char_count += ((*p & 0xc0) != 0x80);
4318 }
4319 *unicode_size = char_count;
4320 return 65537;
4321}
4322
4323/* Called when we encountered some error that wasn't detected in the original
4324 scan, e.g. an encoded surrogate character. The original maxchar computation
4325 may have been incorrect, so redo it. */
4326static int
4327refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
4328{
4329 PyObject *tmp;
Victor Stinnerf8facac2011-11-22 02:30:47 +01004330 Py_ssize_t k;
4331 Py_UCS4 maxchar;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004332 for (k = 0, maxchar = 0; k < n; k++)
4333 maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
4334 tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
4335 if (tmp == NULL)
4336 return -1;
4337 PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
4338 Py_DECREF(*unicode);
4339 *unicode = tmp;
4340 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004341}
4342
/* Store `value` at position `index` of the string being decoded.

   Implicit parameters, taken from the expansion site: unicode, kind, data,
   has_errors and the label onError.

   As long as has_errors is zero this is a plain PyUnicode_WRITE into the
   buffer described by kind/data.  Once has_errors is set, the write goes
   through unicode_putchar(), which is handed &unicode and may therefore
   replace the string object; before that, the string is grown to
   index + index/8 characters if index is beyond its current length.
   Control jumps to onError if either call fails.  Because growth
   overallocates, the result needs to be shrunk at the end.

   Each branch mentions `index` only once; the arguments are parenthesized
   and the temporary has a macro-specific name so that arbitrary argument
   expressions (including ones naming a variable `pos`) expand correctly.

   NOTE(review): growth is only triggered for index > length, so a write at
   index == length is handed to unicode_putchar() without resizing first --
   confirm that this is intended.
*/
#define WRITE_MAYBE_FAIL(index, value)                                   \
    do {                                                                 \
        if (has_errors) {                                                \
            Py_ssize_t wmf_pos_ = (index);                               \
            if (wmf_pos_ > PyUnicode_GET_LENGTH(unicode) &&              \
                unicode_resize(&unicode, wmf_pos_ + wmf_pos_/8) < 0)     \
                goto onError;                                            \
            if (unicode_putchar(&unicode, &wmf_pos_, (value)) < 0)       \
                goto onError;                                            \
        }                                                                \
        else                                                             \
            PyUnicode_WRITE(kind, data, (index), (value));               \
    } while (0)
4361
Alexander Belopolsky40018472011-02-26 01:02:56 +00004362PyObject *
4363PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004364 Py_ssize_t size,
4365 const char *errors,
4366 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004367{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004368 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004369 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004370 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004371 Py_ssize_t startinpos;
4372 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004373 const char *e, *aligned_end;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004374 PyObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004375 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004376 PyObject *errorHandler = NULL;
4377 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004378 Py_UCS4 maxchar = 0;
4379 Py_ssize_t unicode_size;
4380 Py_ssize_t i;
4381 int kind;
4382 void *data;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004383 int has_errors = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004384
Walter Dörwald69652032004-09-07 20:24:22 +00004385 if (size == 0) {
4386 if (consumed)
4387 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004388 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004389 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004390 maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004391 /* When the string is ASCII only, just use memcpy and return.
4392 unicode_size may be != size if there is an incomplete UTF-8
4393 sequence at the end of the ASCII block. */
4394 if (maxchar < 128 && size == unicode_size) {
Victor Stinner42885202011-11-22 01:23:02 +01004395 if (consumed)
4396 *consumed = size;
4397
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004398 if (size == 1)
4399 return get_latin1_char((unsigned char)s[0]);
4400
4401 unicode = PyUnicode_New(unicode_size, maxchar);
4402 if (!unicode)
4403 return NULL;
4404 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4405 assert(_PyUnicode_CheckConsistency(unicode, 1));
4406 return unicode;
4407 }
4408
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004409 /* In case of errors, maxchar and size computation might be incorrect;
4410 code below refits and resizes as necessary. */
4411 unicode = PyUnicode_New(unicode_size, maxchar);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004412 if (!unicode)
4413 return NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004414 kind = PyUnicode_KIND(unicode);
4415 data = PyUnicode_DATA(unicode);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004416
Guido van Rossumd57fd912000-03-10 22:53:23 +00004417 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004418 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419 e = s + size;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004420 switch (kind) {
4421 case PyUnicode_1BYTE_KIND:
4422 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4423 break;
4424 case PyUnicode_2BYTE_KIND:
4425 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4426 break;
4427 case PyUnicode_4BYTE_KIND:
4428 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4429 break;
4430 }
4431 if (!has_errors) {
4432 /* Ensure the unicode size calculation was correct */
4433 assert(i == unicode_size);
4434 assert(s == e);
4435 if (consumed)
4436 *consumed = s-starts;
4437 return unicode;
4438 }
4439 /* Fall through to the generic decoding loop for the rest of
4440 the string */
4441 if (refit_partial_string(&unicode, kind, data, i) < 0)
4442 goto onError;
4443
Antoine Pitrouab868312009-01-10 15:40:25 +00004444 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445
4446 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004447 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004448
4449 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004450 /* Fast path for runs of ASCII characters. Given that common UTF-8
4451 input will consist of an overwhelming majority of ASCII
4452 characters, we try to optimize for this case by checking
4453 as many characters as a C 'long' can contain.
4454 First, check if we can do an aligned read, as most CPUs have
4455 a penalty for unaligned reads.
4456 */
4457 if (!((size_t) s & LONG_PTR_MASK)) {
4458 /* Help register allocation */
4459 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004460 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004461 while (_s < aligned_end) {
4462 /* Read a whole long at a time (either 4 or 8 bytes),
4463 and do a fast unrolled copy if it only contains ASCII
4464 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004465 unsigned long value = *(unsigned long *) _s;
4466 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004467 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004468 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4469 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4470 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4471 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004472#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004473 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4474 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4475 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4476 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004477#endif
4478 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004479 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004480 }
4481 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004482 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004483 if (s == e)
4484 break;
4485 ch = (unsigned char)*s;
4486 }
4487 }
4488
4489 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004490 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004491 s++;
4492 continue;
4493 }
4494
4495 n = utf8_code_length[ch];
4496
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004497 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004498 if (consumed)
4499 break;
4500 else {
4501 errmsg = "unexpected end of data";
4502 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004503 endinpos = startinpos+1;
4504 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4505 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004506 goto utf8Error;
4507 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004508 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004509
4510 switch (n) {
4511
4512 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004513 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004514 startinpos = s-starts;
4515 endinpos = startinpos+1;
4516 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004517
4518 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004519 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004520 startinpos = s-starts;
4521 endinpos = startinpos+1;
4522 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004523
4524 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004525 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004526 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004527 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004528 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 goto utf8Error;
4530 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004531 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004532 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004533 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004534 break;
4535
4536 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004537 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4538 will result in surrogates in range d800-dfff. Surrogates are
4539 not valid UTF-8 so they are rejected.
4540 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4541 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004542 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004543 (s[2] & 0xc0) != 0x80 ||
4544 ((unsigned char)s[0] == 0xE0 &&
4545 (unsigned char)s[1] < 0xA0) ||
4546 ((unsigned char)s[0] == 0xED &&
4547 (unsigned char)s[1] > 0x9F)) {
4548 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004549 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004550 endinpos = startinpos + 1;
4551
4552 /* if s[1] first two bits are 1 and 0, then the invalid
4553 continuation byte is s[2], so increment endinpos by 1,
4554 if not, s[1] is invalid and endinpos doesn't need to
4555 be incremented. */
4556 if ((s[1] & 0xC0) == 0x80)
4557 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004558 goto utf8Error;
4559 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004560 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004561 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004562 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004563 break;
4564
4565 case 4:
4566 if ((s[1] & 0xc0) != 0x80 ||
4567 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004568 (s[3] & 0xc0) != 0x80 ||
4569 ((unsigned char)s[0] == 0xF0 &&
4570 (unsigned char)s[1] < 0x90) ||
4571 ((unsigned char)s[0] == 0xF4 &&
4572 (unsigned char)s[1] > 0x8F)) {
4573 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004574 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004575 endinpos = startinpos + 1;
4576 if ((s[1] & 0xC0) == 0x80) {
4577 endinpos++;
4578 if ((s[2] & 0xC0) == 0x80)
4579 endinpos++;
4580 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004581 goto utf8Error;
4582 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004583 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004584 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004585 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004586
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004587 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004589 }
4590 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004591 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004592
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 utf8Error:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004594 if (!has_errors) {
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004595 if (refit_partial_string(&unicode, kind, data, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004596 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004597 has_errors = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004598 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004599 if (unicode_decode_call_errorhandler(
4600 errors, &errorHandler,
4601 "utf8", errmsg,
4602 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004603 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004604 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004605 /* Update data because unicode_decode_call_errorhandler might have
4606 re-created or resized the unicode object. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004607 data = PyUnicode_DATA(unicode);
4608 kind = PyUnicode_KIND(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004609 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004611 /* Ensure the unicode_size calculation above was correct: */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004612 assert(has_errors || i == unicode_size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004613
Walter Dörwald69652032004-09-07 20:24:22 +00004614 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004615 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004617 /* Adjust length and ready string when it contained errors and
4618 is of the old resizable kind. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004619 if (has_errors) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01004620 if (PyUnicode_Resize(&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004621 goto onError;
4622 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004624 Py_XDECREF(errorHandler);
4625 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004626 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004627 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628
Benjamin Peterson29060642009-01-31 22:14:21 +00004629 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004630 Py_XDECREF(errorHandler);
4631 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004632 Py_DECREF(unicode);
4633 return NULL;
4634}
4635
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004636#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004637
#ifdef __APPLE__

/* Simplified UTF-8 decoder with built-in surrogateescape behaviour, used to
   decode the command line arguments on Mac OS X.

   Decodes `size` bytes at `s` into a NUL-terminated wchar_t array obtained
   from PyMem_Malloc.  A byte that does not start a well-formed sequence is
   stored as 0xDC00 + byte and decoding resumes with the byte after it.
   Returns NULL if the array cannot be allocated. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *stop;
    wchar_t *buffer, *out;
    int seqlen;

    /* The result never has more elements than the input has bytes, so
       size + 1 elements (terminator included) are always enough. */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (buffer == NULL)
        return NULL;

    out = buffer;
    stop = s + size;
    while (s < stop) {
        /* ch holds the lead byte until a sequence has been validated. */
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *out++ = (wchar_t)ch;
            s++;
            continue;
        }

        seqlen = utf8_code_length[ch];
        if (s + seqlen > stop) {
            /* sequence cut off by the end of the input */
            goto escape_byte;
        }

        switch (seqlen) {
        case 0:
        case 1:
            goto escape_byte;

        case 2: {
            const unsigned char b0 = (unsigned char)s[0];
            const unsigned char b1 = (unsigned char)s[1];

            if ((b1 & 0xc0) != 0x80)
                goto escape_byte;
            ch = ((b0 & 0x1f) << 6) + (b1 & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *out++ = (wchar_t)ch;
            break;
        }

        case 3: {
            const unsigned char b0 = (unsigned char)s[0];
            const unsigned char b1 = (unsigned char)s[1];
            const unsigned char b2 = (unsigned char)s[2];

            if ((b1 & 0xc0) != 0x80 || (b2 & 0xc0) != 0x80)
                goto escape_byte;
            /* E0 followed by less than A0 would decode to a value that
               fits in two bytes. */
            if (b0 == 0xE0 && b1 < 0xA0)
                goto escape_byte;
            /* \xed\xa0\x80-\xed\xbf\xbf would decode to surrogates in
               range d800-dfff, which are not valid UTF-8.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if (b0 == 0xED && b1 > 0x9F)
                goto escape_byte;
            ch = ((b0 & 0x0f) << 12) + ((b1 & 0x3f) << 6) + (b2 & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *out++ = (wchar_t)ch;
            break;
        }

        case 4: {
            const unsigned char b0 = (unsigned char)s[0];
            const unsigned char b1 = (unsigned char)s[1];
            const unsigned char b2 = (unsigned char)s[2];
            const unsigned char b3 = (unsigned char)s[3];

            if ((b1 & 0xc0) != 0x80 ||
                (b2 & 0xc0) != 0x80 ||
                (b3 & 0xc0) != 0x80)
                goto escape_byte;
            /* Keep the decoded value within (0xFFFF, MAX_UNICODE]. */
            if (b0 == 0xF0 && b1 < 0x90)
                goto escape_byte;
            if (b0 == 0xF4 && b1 > 0x8F)
                goto escape_byte;
            ch = ((b0 & 0x7) << 18) + ((b1 & 0x3f) << 12) +
                 ((b2 & 0x3f) << 6) + (b3 & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
            *out++ = (wchar_t)ch;
#else
            /* wchar_t is too narrow: store a surrogate pair instead */
            *out++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            *out++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
            break;
        }
        }
        s += seqlen;
        continue;

      escape_byte:
        /* ch is still the offending byte here */
        *out++ = 0xDC00 + ch;
        s++;
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004745
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004746/* Primary internal function which creates utf8 encoded bytes objects.
4747
4748 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004749 and allocate exactly as much space needed at the end. Else allocate the
4750 maximum possible needed (4 result bytes per Unicode character), and return
4751 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004752*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004753PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004754_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755{
Tim Peters602f7402002-04-27 18:03:26 +00004756#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004757
Guido van Rossum98297ee2007-11-06 21:34:58 +00004758 Py_ssize_t i; /* index into s of next input byte */
4759 PyObject *result; /* result string object */
4760 char *p; /* next free byte in output buffer */
4761 Py_ssize_t nallocated; /* number of result bytes allocated */
4762 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004763 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004764 PyObject *errorHandler = NULL;
4765 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004766 int kind;
4767 void *data;
4768 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004769 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004770
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004771 if (!PyUnicode_Check(unicode)) {
4772 PyErr_BadArgument();
4773 return NULL;
4774 }
4775
4776 if (PyUnicode_READY(unicode) == -1)
4777 return NULL;
4778
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004779 if (PyUnicode_UTF8(unicode))
4780 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4781 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004782
4783 kind = PyUnicode_KIND(unicode);
4784 data = PyUnicode_DATA(unicode);
4785 size = PyUnicode_GET_LENGTH(unicode);
4786
Tim Peters602f7402002-04-27 18:03:26 +00004787 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788
Tim Peters602f7402002-04-27 18:03:26 +00004789 if (size <= MAX_SHORT_UNICHARS) {
4790 /* Write into the stack buffer; nallocated can't overflow.
4791 * At the end, we'll allocate exactly as much heap space as it
4792 * turns out we need.
4793 */
4794 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004795 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004796 p = stackbuf;
4797 }
4798 else {
4799 /* Overallocate on the heap, and give the excess back at the end. */
4800 nallocated = size * 4;
4801 if (nallocated / 4 != size) /* overflow! */
4802 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004803 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004804 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004805 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004806 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004807 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004808
Tim Peters602f7402002-04-27 18:03:26 +00004809 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004810 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004811
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004812 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004813 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004815
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004817 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004818 *p++ = (char)(0xc0 | (ch >> 6));
4819 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner551ac952011-11-29 22:58:13 +01004820 } else if (Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004821 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004822 Py_ssize_t repsize, k, startpos;
4823 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004824 rep = unicode_encode_call_errorhandler(
4825 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004826 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004827 if (!rep)
4828 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004830 if (PyBytes_Check(rep))
4831 repsize = PyBytes_GET_SIZE(rep);
4832 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004833 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004834
4835 if (repsize > 4) {
4836 Py_ssize_t offset;
4837
4838 if (result == NULL)
4839 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004840 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004841 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004843 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4844 /* integer overflow */
4845 PyErr_NoMemory();
4846 goto error;
4847 }
4848 nallocated += repsize - 4;
4849 if (result != NULL) {
4850 if (_PyBytes_Resize(&result, nallocated) < 0)
4851 goto error;
4852 } else {
4853 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004854 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004855 goto error;
4856 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4857 }
4858 p = PyBytes_AS_STRING(result) + offset;
4859 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004861 if (PyBytes_Check(rep)) {
4862 char *prep = PyBytes_AS_STRING(rep);
4863 for(k = repsize; k > 0; k--)
4864 *p++ = *prep++;
4865 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004866 enum PyUnicode_Kind repkind;
4867 void *repdata;
4868
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004869 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004870 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004871 repkind = PyUnicode_KIND(rep);
4872 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004873
4874 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004875 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004876 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004877 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004878 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004879 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004880 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004881 goto error;
4882 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004883 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004884 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004885 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004886 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004887 } else if (ch < 0x10000) {
4888 *p++ = (char)(0xe0 | (ch >> 12));
4889 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4890 *p++ = (char)(0x80 | (ch & 0x3f));
4891 } else /* ch >= 0x10000 */ {
Victor Stinner8faf8212011-12-08 22:14:11 +01004892 assert(ch <= MAX_UNICODE);
Tim Peters602f7402002-04-27 18:03:26 +00004893 /* Encode UCS4 Unicode ordinals */
4894 *p++ = (char)(0xf0 | (ch >> 18));
4895 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4896 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4897 *p++ = (char)(0x80 | (ch & 0x3f));
4898 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004900
Guido van Rossum98297ee2007-11-06 21:34:58 +00004901 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004902 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004903 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004904 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004905 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004906 }
4907 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004908 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004909 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004910 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004911 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004912 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004913
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004914 Py_XDECREF(errorHandler);
4915 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004916 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004917 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004918 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004919 Py_XDECREF(errorHandler);
4920 Py_XDECREF(exc);
4921 Py_XDECREF(result);
4922 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004923
Tim Peters602f7402002-04-27 18:03:26 +00004924#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925}
4926
Alexander Belopolsky40018472011-02-26 01:02:56 +00004927PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004928PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4929 Py_ssize_t size,
4930 const char *errors)
4931{
4932 PyObject *v, *unicode;
4933
4934 unicode = PyUnicode_FromUnicode(s, size);
4935 if (unicode == NULL)
4936 return NULL;
4937 v = _PyUnicode_AsUTF8String(unicode, errors);
4938 Py_DECREF(unicode);
4939 return v;
4940}
4941
4942PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004943PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004945 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004946}
4947
Walter Dörwald41980ca2007-08-16 21:55:45 +00004948/* --- UTF-32 Codec ------------------------------------------------------- */
4949
4950PyObject *
4951PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004952 Py_ssize_t size,
4953 const char *errors,
4954 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004955{
4956 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4957}
4958
4959PyObject *
4960PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004961 Py_ssize_t size,
4962 const char *errors,
4963 int *byteorder,
4964 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004965{
4966 const char *starts = s;
4967 Py_ssize_t startinpos;
4968 Py_ssize_t endinpos;
4969 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004970 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004971 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004972 int bo = 0; /* assume native ordering by default */
4973 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004974 /* Offsets from q for retrieving bytes in the right order. */
4975#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4976 int iorder[] = {0, 1, 2, 3};
4977#else
4978 int iorder[] = {3, 2, 1, 0};
4979#endif
4980 PyObject *errorHandler = NULL;
4981 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004982
Walter Dörwald41980ca2007-08-16 21:55:45 +00004983 q = (unsigned char *)s;
4984 e = q + size;
4985
4986 if (byteorder)
4987 bo = *byteorder;
4988
4989 /* Check for BOM marks (U+FEFF) in the input and adjust current
4990 byte order setting accordingly. In native mode, the leading BOM
4991 mark is skipped, in all other modes, it is copied to the output
4992 stream as-is (giving a ZWNBSP character). */
4993 if (bo == 0) {
4994 if (size >= 4) {
4995 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004996 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004997#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004998 if (bom == 0x0000FEFF) {
4999 q += 4;
5000 bo = -1;
5001 }
5002 else if (bom == 0xFFFE0000) {
5003 q += 4;
5004 bo = 1;
5005 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005006#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005007 if (bom == 0x0000FEFF) {
5008 q += 4;
5009 bo = 1;
5010 }
5011 else if (bom == 0xFFFE0000) {
5012 q += 4;
5013 bo = -1;
5014 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005015#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005016 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005017 }
5018
5019 if (bo == -1) {
5020 /* force LE */
5021 iorder[0] = 0;
5022 iorder[1] = 1;
5023 iorder[2] = 2;
5024 iorder[3] = 3;
5025 }
5026 else if (bo == 1) {
5027 /* force BE */
5028 iorder[0] = 3;
5029 iorder[1] = 2;
5030 iorder[2] = 1;
5031 iorder[3] = 0;
5032 }
5033
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005034 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005035 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005036 if (!unicode)
5037 return NULL;
5038 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005039 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005040 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005041
Walter Dörwald41980ca2007-08-16 21:55:45 +00005042 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005043 Py_UCS4 ch;
5044 /* remaining bytes at the end? (size should be divisible by 4) */
5045 if (e-q<4) {
5046 if (consumed)
5047 break;
5048 errmsg = "truncated data";
5049 startinpos = ((const char *)q)-starts;
5050 endinpos = ((const char *)e)-starts;
5051 goto utf32Error;
5052 /* The remaining input chars are ignored if the callback
5053 chooses to skip the input */
5054 }
5055 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5056 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005057
Benjamin Peterson29060642009-01-31 22:14:21 +00005058 if (ch >= 0x110000)
5059 {
5060 errmsg = "codepoint not in range(0x110000)";
5061 startinpos = ((const char *)q)-starts;
5062 endinpos = startinpos+4;
5063 goto utf32Error;
5064 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005065 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5066 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005067 q += 4;
5068 continue;
5069 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005070 if (unicode_decode_call_errorhandler(
5071 errors, &errorHandler,
5072 "utf32", errmsg,
5073 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005074 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005075 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005076 }
5077
5078 if (byteorder)
5079 *byteorder = bo;
5080
5081 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005082 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005083
5084 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005085 if (PyUnicode_Resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005086 goto onError;
5087
5088 Py_XDECREF(errorHandler);
5089 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005090 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005091
Benjamin Peterson29060642009-01-31 22:14:21 +00005092 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005093 Py_DECREF(unicode);
5094 Py_XDECREF(errorHandler);
5095 Py_XDECREF(exc);
5096 return NULL;
5097}
5098
5099PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005100_PyUnicode_EncodeUTF32(PyObject *str,
5101 const char *errors,
5102 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005103{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005104 int kind;
5105 void *data;
5106 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005107 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005108 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005109 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005110 /* Offsets from p for storing byte pairs in the right order. */
5111#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5112 int iorder[] = {0, 1, 2, 3};
5113#else
5114 int iorder[] = {3, 2, 1, 0};
5115#endif
5116
Benjamin Peterson29060642009-01-31 22:14:21 +00005117#define STORECHAR(CH) \
5118 do { \
5119 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5120 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5121 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5122 p[iorder[0]] = (CH) & 0xff; \
5123 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005124 } while(0)
5125
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005126 if (!PyUnicode_Check(str)) {
5127 PyErr_BadArgument();
5128 return NULL;
5129 }
5130 if (PyUnicode_READY(str) < 0)
5131 return NULL;
5132 kind = PyUnicode_KIND(str);
5133 data = PyUnicode_DATA(str);
5134 len = PyUnicode_GET_LENGTH(str);
5135
5136 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005137 bytesize = nsize * 4;
5138 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005139 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005140 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005141 if (v == NULL)
5142 return NULL;
5143
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005144 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005145 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005146 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005147 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005148 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005149
5150 if (byteorder == -1) {
5151 /* force LE */
5152 iorder[0] = 0;
5153 iorder[1] = 1;
5154 iorder[2] = 2;
5155 iorder[3] = 3;
5156 }
5157 else if (byteorder == 1) {
5158 /* force BE */
5159 iorder[0] = 3;
5160 iorder[1] = 2;
5161 iorder[2] = 1;
5162 iorder[3] = 0;
5163 }
5164
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005165 for (i = 0; i < len; i++)
5166 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005167
5168 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005169 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005170#undef STORECHAR
5171}
5172
Alexander Belopolsky40018472011-02-26 01:02:56 +00005173PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005174PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5175 Py_ssize_t size,
5176 const char *errors,
5177 int byteorder)
5178{
5179 PyObject *result;
5180 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5181 if (tmp == NULL)
5182 return NULL;
5183 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5184 Py_DECREF(tmp);
5185 return result;
5186}
5187
5188PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005189PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005190{
Victor Stinnerb960b342011-11-20 19:12:52 +01005191 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005192}
5193
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194/* --- UTF-16 Codec ------------------------------------------------------- */
5195
Tim Peters772747b2001-08-09 22:21:55 +00005196PyObject *
5197PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005198 Py_ssize_t size,
5199 const char *errors,
5200 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201{
Walter Dörwald69652032004-09-07 20:24:22 +00005202 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5203}
5204
Antoine Pitrouab868312009-01-10 15:40:25 +00005205/* Two masks for fast checking of whether a C 'long' may contain
5206 UTF16-encoded surrogate characters. This is an efficient heuristic,
5207 assuming that non-surrogate characters with a code point >= 0x8000 are
5208 rare in most input.
5209 FAST_CHAR_MASK is used when the input is in native byte ordering,
5210 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005211*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005212#if (SIZEOF_LONG == 8)
5213# define FAST_CHAR_MASK 0x8000800080008000L
5214# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5215#elif (SIZEOF_LONG == 4)
5216# define FAST_CHAR_MASK 0x80008000L
5217# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5218#else
5219# error C 'long' size should be either 4 or 8!
5220#endif
5221
Walter Dörwald69652032004-09-07 20:24:22 +00005222PyObject *
5223PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005224 Py_ssize_t size,
5225 const char *errors,
5226 int *byteorder,
5227 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005228{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005229 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005230 Py_ssize_t startinpos;
5231 Py_ssize_t endinpos;
5232 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005233 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005234 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005235 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005236 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005237 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005238 /* Offsets from q for retrieving byte pairs in the right order. */
5239#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5240 int ihi = 1, ilo = 0;
5241#else
5242 int ihi = 0, ilo = 1;
5243#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005244 PyObject *errorHandler = NULL;
5245 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246
5247 /* Note: size will always be longer than the resulting Unicode
5248 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005249 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250 if (!unicode)
5251 return NULL;
5252 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005253 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005254 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255
Tim Peters772747b2001-08-09 22:21:55 +00005256 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005257 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258
5259 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005260 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005262 /* Check for BOM marks (U+FEFF) in the input and adjust current
5263 byte order setting accordingly. In native mode, the leading BOM
5264 mark is skipped, in all other modes, it is copied to the output
5265 stream as-is (giving a ZWNBSP character). */
5266 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005267 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005268 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005269#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005270 if (bom == 0xFEFF) {
5271 q += 2;
5272 bo = -1;
5273 }
5274 else if (bom == 0xFFFE) {
5275 q += 2;
5276 bo = 1;
5277 }
Tim Petersced69f82003-09-16 20:30:58 +00005278#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005279 if (bom == 0xFEFF) {
5280 q += 2;
5281 bo = 1;
5282 }
5283 else if (bom == 0xFFFE) {
5284 q += 2;
5285 bo = -1;
5286 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005287#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005288 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005289 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290
Tim Peters772747b2001-08-09 22:21:55 +00005291 if (bo == -1) {
5292 /* force LE */
5293 ihi = 1;
5294 ilo = 0;
5295 }
5296 else if (bo == 1) {
5297 /* force BE */
5298 ihi = 0;
5299 ilo = 1;
5300 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005301#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5302 native_ordering = ilo < ihi;
5303#else
5304 native_ordering = ilo > ihi;
5305#endif
Tim Peters772747b2001-08-09 22:21:55 +00005306
Antoine Pitrouab868312009-01-10 15:40:25 +00005307 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005308 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005309 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005310 /* First check for possible aligned read of a C 'long'. Unaligned
5311 reads are more expensive, better to defer to another iteration. */
5312 if (!((size_t) q & LONG_PTR_MASK)) {
5313 /* Fast path for runs of non-surrogate chars. */
5314 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005315 int kind = PyUnicode_KIND(unicode);
5316 void *data = PyUnicode_DATA(unicode);
5317 while (_q < aligned_end) {
5318 unsigned long block = * (unsigned long *) _q;
5319 unsigned short *pblock = (unsigned short*)&block;
5320 Py_UCS4 maxch;
5321 if (native_ordering) {
5322 /* Can use buffer directly */
5323 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005324 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005325 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005326 else {
5327 /* Need to byte-swap */
5328 unsigned char *_p = (unsigned char*)pblock;
5329 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005330 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005331 _p[0] = _q[1];
5332 _p[1] = _q[0];
5333 _p[2] = _q[3];
5334 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005335#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005336 _p[4] = _q[5];
5337 _p[5] = _q[4];
5338 _p[6] = _q[7];
5339 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005340#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005341 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005342 maxch = Py_MAX(pblock[0], pblock[1]);
5343#if SIZEOF_LONG == 8
5344 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5345#endif
5346 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5347 if (unicode_widen(&unicode, maxch) < 0)
5348 goto onError;
5349 kind = PyUnicode_KIND(unicode);
5350 data = PyUnicode_DATA(unicode);
5351 }
5352 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5353 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5354#if SIZEOF_LONG == 8
5355 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5356 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5357#endif
5358 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005359 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005360 q = _q;
5361 if (q >= e)
5362 break;
5363 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005365
Benjamin Peterson14339b62009-01-31 16:36:08 +00005366 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005367
Victor Stinner551ac952011-11-29 22:58:13 +01005368 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005369 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5370 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 continue;
5372 }
5373
5374 /* UTF-16 code pair: */
5375 if (q > e) {
5376 errmsg = "unexpected end of data";
5377 startinpos = (((const char *)q) - 2) - starts;
5378 endinpos = ((const char *)e) + 1 - starts;
5379 goto utf16Error;
5380 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005381 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5382 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005383 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005384 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005385 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005386 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005387 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005388 continue;
5389 }
5390 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005391 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005392 startinpos = (((const char *)q)-4)-starts;
5393 endinpos = startinpos+2;
5394 goto utf16Error;
5395 }
5396
Benjamin Peterson14339b62009-01-31 16:36:08 +00005397 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005398 errmsg = "illegal encoding";
5399 startinpos = (((const char *)q)-2)-starts;
5400 endinpos = startinpos+2;
5401 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005402
Benjamin Peterson29060642009-01-31 22:14:21 +00005403 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005405 errors,
5406 &errorHandler,
5407 "utf16", errmsg,
5408 &starts,
5409 (const char **)&e,
5410 &startinpos,
5411 &endinpos,
5412 &exc,
5413 (const char **)&q,
5414 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005415 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005416 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005418 /* remaining byte at the end? (size should be even) */
5419 if (e == q) {
5420 if (!consumed) {
5421 errmsg = "truncated data";
5422 startinpos = ((const char *)q) - starts;
5423 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005424 if (unicode_decode_call_errorhandler(
5425 errors,
5426 &errorHandler,
5427 "utf16", errmsg,
5428 &starts,
5429 (const char **)&e,
5430 &startinpos,
5431 &endinpos,
5432 &exc,
5433 (const char **)&q,
5434 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005435 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005436 goto onError;
5437 /* The remaining input chars are ignored if the callback
5438 chooses to skip the input */
5439 }
5440 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441
5442 if (byteorder)
5443 *byteorder = bo;
5444
Walter Dörwald69652032004-09-07 20:24:22 +00005445 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005446 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005447
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005449 if (PyUnicode_Resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 goto onError;
5451
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005452 Py_XDECREF(errorHandler);
5453 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005454 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455
Benjamin Peterson29060642009-01-31 22:14:21 +00005456 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005458 Py_XDECREF(errorHandler);
5459 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460 return NULL;
5461}
5462
Antoine Pitrouab868312009-01-10 15:40:25 +00005463#undef FAST_CHAR_MASK
5464#undef SWAPPED_FAST_CHAR_MASK
5465
Tim Peters772747b2001-08-09 22:21:55 +00005466PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005467_PyUnicode_EncodeUTF16(PyObject *str,
5468 const char *errors,
5469 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005471 int kind;
5472 void *data;
5473 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005474 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005475 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005476 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005477 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005478 /* Offsets from p for storing byte pairs in the right order. */
5479#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5480 int ihi = 1, ilo = 0;
5481#else
5482 int ihi = 0, ilo = 1;
5483#endif
5484
Benjamin Peterson29060642009-01-31 22:14:21 +00005485#define STORECHAR(CH) \
5486 do { \
5487 p[ihi] = ((CH) >> 8) & 0xff; \
5488 p[ilo] = (CH) & 0xff; \
5489 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005490 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005492 if (!PyUnicode_Check(str)) {
5493 PyErr_BadArgument();
5494 return NULL;
5495 }
5496 if (PyUnicode_READY(str) < 0)
5497 return NULL;
5498 kind = PyUnicode_KIND(str);
5499 data = PyUnicode_DATA(str);
5500 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005501
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005502 pairs = 0;
5503 if (kind == PyUnicode_4BYTE_KIND)
5504 for (i = 0; i < len; i++)
5505 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5506 pairs++;
5507 /* 2 * (len + pairs + (byteorder == 0)) */
5508 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005509 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005510 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005511 bytesize = nsize * 2;
5512 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005514 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515 if (v == NULL)
5516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005518 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005520 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005521 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005522 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005523
5524 if (byteorder == -1) {
5525 /* force LE */
5526 ihi = 1;
5527 ilo = 0;
5528 }
5529 else if (byteorder == 1) {
5530 /* force BE */
5531 ihi = 0;
5532 ilo = 1;
5533 }
5534
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005535 for (i = 0; i < len; i++) {
5536 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5537 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005538 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005539 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5540 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005541 }
Tim Peters772747b2001-08-09 22:21:55 +00005542 STORECHAR(ch);
5543 if (ch2)
5544 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005545 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005546
5547 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005548 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005549#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550}
5551
Alexander Belopolsky40018472011-02-26 01:02:56 +00005552PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005553PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5554 Py_ssize_t size,
5555 const char *errors,
5556 int byteorder)
5557{
5558 PyObject *result;
5559 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5560 if (tmp == NULL)
5561 return NULL;
5562 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5563 Py_DECREF(tmp);
5564 return result;
5565}
5566
5567PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005568PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005570 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571}
5572
5573/* --- Unicode Escape Codec ----------------------------------------------- */
5574
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005575/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5576 if all the escapes in the string make it still a valid ASCII string.
5577 Returns -1 if any escapes were found which cause the string to
5578 pop out of ASCII range. Otherwise returns the length of the
5579 required buffer to hold the string.
5580 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005581static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005582length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5583{
5584 const unsigned char *p = (const unsigned char *)s;
5585 const unsigned char *end = p + size;
5586 Py_ssize_t length = 0;
5587
5588 if (size < 0)
5589 return -1;
5590
5591 for (; p < end; ++p) {
5592 if (*p > 127) {
5593 /* Non-ASCII */
5594 return -1;
5595 }
5596 else if (*p != '\\') {
5597 /* Normal character */
5598 ++length;
5599 }
5600 else {
5601 /* Backslash-escape, check next char */
5602 ++p;
5603 /* Escape sequence reaches till end of string or
5604 non-ASCII follow-up. */
5605 if (p >= end || *p > 127)
5606 return -1;
5607 switch (*p) {
5608 case '\n':
5609 /* backslash + \n result in zero characters */
5610 break;
5611 case '\\': case '\'': case '\"':
5612 case 'b': case 'f': case 't':
5613 case 'n': case 'r': case 'v': case 'a':
5614 ++length;
5615 break;
5616 case '0': case '1': case '2': case '3':
5617 case '4': case '5': case '6': case '7':
5618 case 'x': case 'u': case 'U': case 'N':
5619 /* these do not guarantee ASCII characters */
5620 return -1;
5621 default:
5622 /* count the backslash + the other character */
5623 length += 2;
5624 }
5625 }
5626 }
5627 return length;
5628}
5629
Fredrik Lundh06d12682001-01-24 07:59:11 +00005630static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005631
Alexander Belopolsky40018472011-02-26 01:02:56 +00005632PyObject *
5633PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005634 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005635 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005637 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005638 Py_ssize_t startinpos;
5639 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005640 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005641 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005643 char* message;
5644 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005645 PyObject *errorHandler = NULL;
5646 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005647 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005648 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005649
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005650 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005651
5652 /* After length_of_escaped_ascii_string() there are two alternatives,
5653 either the string is pure ASCII with named escapes like \n, etc.
5654 and we determined it's exact size (common case)
5655 or it contains \x, \u, ... escape sequences. then we create a
5656 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005657 if (len >= 0) {
5658 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005659 if (!v)
5660 goto onError;
5661 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005662 }
5663 else {
5664 /* Escaped strings will always be longer than the resulting
5665 Unicode string, so we start with size here and then reduce the
5666 length after conversion to the true value.
5667 (but if the error callback returns a long replacement string
5668 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005669 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005670 if (!v)
5671 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005672 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005673 }
5674
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005676 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005677 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005679
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 while (s < end) {
5681 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005682 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005683 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005685 /* The only case in which i == ascii_length is a backslash
5686 followed by a newline. */
5687 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005688
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 /* Non-escape characters are interpreted as Unicode ordinals */
5690 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005691 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5692 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 continue;
5694 }
5695
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005696 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 /* \ - Escapes */
5698 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005699 c = *s++;
5700 if (s > end)
5701 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005702
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005703 /* The only case in which i == ascii_length is a backslash
5704 followed by a newline. */
5705 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005706
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005707 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708
Benjamin Peterson29060642009-01-31 22:14:21 +00005709 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005710#define WRITECHAR(ch) \
5711 do { \
5712 if (unicode_putchar(&v, &i, ch) < 0) \
5713 goto onError; \
5714 }while(0)
5715
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005717 case '\\': WRITECHAR('\\'); break;
5718 case '\'': WRITECHAR('\''); break;
5719 case '\"': WRITECHAR('\"'); break;
5720 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005721 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005722 case 'f': WRITECHAR('\014'); break;
5723 case 't': WRITECHAR('\t'); break;
5724 case 'n': WRITECHAR('\n'); break;
5725 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005726 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005727 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005728 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005729 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730
Benjamin Peterson29060642009-01-31 22:14:21 +00005731 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 case '0': case '1': case '2': case '3':
5733 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005734 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005735 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005736 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005737 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005738 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005740 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 break;
5742
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 /* hex escapes */
5744 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005746 digits = 2;
5747 message = "truncated \\xXX escape";
5748 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749
Benjamin Peterson29060642009-01-31 22:14:21 +00005750 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005752 digits = 4;
5753 message = "truncated \\uXXXX escape";
5754 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755
Benjamin Peterson29060642009-01-31 22:14:21 +00005756 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005757 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005758 digits = 8;
5759 message = "truncated \\UXXXXXXXX escape";
5760 hexescape:
5761 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005762 if (s+digits>end) {
5763 endinpos = size;
5764 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 errors, &errorHandler,
5766 "unicodeescape", "end of string in escape sequence",
5767 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005768 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005769 goto onError;
5770 goto nextByte;
5771 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005772 for (j = 0; j < digits; ++j) {
5773 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005774 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005775 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005776 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005777 errors, &errorHandler,
5778 "unicodeescape", message,
5779 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005780 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005781 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005782 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005783 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005784 }
5785 chr = (chr<<4) & ~0xF;
5786 if (c >= '0' && c <= '9')
5787 chr += c - '0';
5788 else if (c >= 'a' && c <= 'f')
5789 chr += 10 + c - 'a';
5790 else
5791 chr += 10 + c - 'A';
5792 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005793 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005794 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005795 /* _decoding_error will have already written into the
5796 target buffer. */
5797 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005798 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005799 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005800 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005801 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005802 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005803 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005804 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 errors, &errorHandler,
5806 "unicodeescape", "illegal Unicode character",
5807 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005808 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005809 goto onError;
5810 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005811 break;
5812
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005814 case 'N':
5815 message = "malformed \\N character escape";
5816 if (ucnhash_CAPI == NULL) {
5817 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005818 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5819 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005820 if (ucnhash_CAPI == NULL)
5821 goto ucnhashError;
5822 }
5823 if (*s == '{') {
5824 const char *start = s+1;
5825 /* look for the closing brace */
5826 while (*s != '}' && s < end)
5827 s++;
5828 if (s > start && s < end && *s == '}') {
5829 /* found a name. look it up in the unicode database */
5830 message = "unknown Unicode character name";
5831 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005832 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005833 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005834 goto store;
5835 }
5836 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005837 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005838 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005839 errors, &errorHandler,
5840 "unicodeescape", message,
5841 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005842 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005843 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005844 break;
5845
5846 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005847 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005848 message = "\\ at end of string";
5849 s--;
5850 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005851 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005852 errors, &errorHandler,
5853 "unicodeescape", message,
5854 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005855 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005856 goto onError;
5857 }
5858 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005859 WRITECHAR('\\');
5860 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005861 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005862 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005864 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005865 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005867#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005868
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005869 if (PyUnicode_Resize(&v, i) < 0)
5870 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005871 Py_XDECREF(errorHandler);
5872 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005873 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005874
Benjamin Peterson29060642009-01-31 22:14:21 +00005875 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005876 PyErr_SetString(
5877 PyExc_UnicodeError,
5878 "\\N escapes not supported (can't load unicodedata module)"
5879 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005880 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005881 Py_XDECREF(errorHandler);
5882 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005883 return NULL;
5884
Benjamin Peterson29060642009-01-31 22:14:21 +00005885 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005887 Py_XDECREF(errorHandler);
5888 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889 return NULL;
5890}
5891
5892/* Return a Unicode-Escape string version of the Unicode object.
5893
5894 If quotes is true, the string is enclosed in u"" or u'' quotes as
5895 appropriate.
5896
5897*/
5898
Alexander Belopolsky40018472011-02-26 01:02:56 +00005899PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005900PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005902 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005903 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005905 int kind;
5906 void *data;
5907 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908
Thomas Wouters89f507f2006-12-13 04:49:30 +00005909 /* Initial allocation is based on the longest-possible unichr
5910 escape.
5911
5912 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5913 unichr, so in this case it's the longest unichr escape. In
5914 narrow (UTF-16) builds this is five chars per source unichr
5915 since there are two unichrs in the surrogate pair, so in narrow
5916 (UTF-16) builds it's not the longest unichr escape.
5917
5918 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5919 so in the narrow (UTF-16) build case it's the longest unichr
5920 escape.
5921 */
5922
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005923 if (!PyUnicode_Check(unicode)) {
5924 PyErr_BadArgument();
5925 return NULL;
5926 }
5927 if (PyUnicode_READY(unicode) < 0)
5928 return NULL;
5929 len = PyUnicode_GET_LENGTH(unicode);
5930 kind = PyUnicode_KIND(unicode);
5931 data = PyUnicode_DATA(unicode);
5932 switch(kind) {
5933 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5934 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5935 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5936 }
5937
5938 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005939 return PyBytes_FromStringAndSize(NULL, 0);
5940
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005941 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005942 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005943
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005944 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005946 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005947 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 if (repr == NULL)
5949 return NULL;
5950
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005951 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005953 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005954 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005955
Walter Dörwald79e913e2007-05-12 11:08:06 +00005956 /* Escape backslashes */
5957 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 *p++ = '\\';
5959 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005960 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005961 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005962
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005963 /* Map 21-bit characters to '\U00xxxxxx' */
5964 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005965 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005966 *p++ = '\\';
5967 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005968 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5969 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5970 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5971 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5972 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5973 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5974 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5975 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005977 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005978
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005980 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 *p++ = '\\';
5982 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005983 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5984 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5985 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5986 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005988
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005989 /* Map special whitespace to '\t', \n', '\r' */
5990 else if (ch == '\t') {
5991 *p++ = '\\';
5992 *p++ = 't';
5993 }
5994 else if (ch == '\n') {
5995 *p++ = '\\';
5996 *p++ = 'n';
5997 }
5998 else if (ch == '\r') {
5999 *p++ = '\\';
6000 *p++ = 'r';
6001 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006002
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006003 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006004 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006006 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006007 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6008 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006009 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006010
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 /* Copy everything else as-is */
6012 else
6013 *p++ = (char) ch;
6014 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006016 assert(p - PyBytes_AS_STRING(repr) > 0);
6017 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6018 return NULL;
6019 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020}
6021
Alexander Belopolsky40018472011-02-26 01:02:56 +00006022PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006023PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6024 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006026 PyObject *result;
6027 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6028 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006030 result = PyUnicode_AsUnicodeEscapeString(tmp);
6031 Py_DECREF(tmp);
6032 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033}
6034
6035/* --- Raw Unicode Escape Codec ------------------------------------------- */
6036
Alexander Belopolsky40018472011-02-26 01:02:56 +00006037PyObject *
6038PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006039 Py_ssize_t size,
6040 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006042 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006043 Py_ssize_t startinpos;
6044 Py_ssize_t endinpos;
6045 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006046 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 const char *end;
6048 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006049 PyObject *errorHandler = NULL;
6050 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006051
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 /* Escaped strings will always be longer than the resulting
6053 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006054 length after conversion to the true value. (But decoding error
6055 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006056 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006060 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006061 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062 end = s + size;
6063 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 unsigned char c;
6065 Py_UCS4 x;
6066 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006067 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 /* Non-escape characters are interpreted as Unicode ordinals */
6070 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006071 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6072 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006074 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 startinpos = s-starts;
6076
6077 /* \u-escapes are only interpreted iff the number of leading
6078 backslashes if odd */
6079 bs = s;
6080 for (;s < end;) {
6081 if (*s != '\\')
6082 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006083 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6084 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 }
6086 if (((s - bs) & 1) == 0 ||
6087 s >= end ||
6088 (*s != 'u' && *s != 'U')) {
6089 continue;
6090 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006091 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006092 count = *s=='u' ? 4 : 8;
6093 s++;
6094
6095 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006096 for (x = 0, i = 0; i < count; ++i, ++s) {
6097 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006098 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 endinpos = s-starts;
6100 if (unicode_decode_call_errorhandler(
6101 errors, &errorHandler,
6102 "rawunicodeescape", "truncated \\uXXXX",
6103 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006104 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006105 goto onError;
6106 goto nextByte;
6107 }
6108 x = (x<<4) & ~0xF;
6109 if (c >= '0' && c <= '9')
6110 x += c - '0';
6111 else if (c >= 'a' && c <= 'f')
6112 x += 10 + c - 'a';
6113 else
6114 x += 10 + c - 'A';
6115 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006116 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006117 if (unicode_putchar(&v, &outpos, x) < 0)
6118 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006119 } else {
6120 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006121 if (unicode_decode_call_errorhandler(
6122 errors, &errorHandler,
6123 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006125 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006126 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006127 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 nextByte:
6129 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006131 if (PyUnicode_Resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006133 Py_XDECREF(errorHandler);
6134 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006135 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006136
Benjamin Peterson29060642009-01-31 22:14:21 +00006137 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006139 Py_XDECREF(errorHandler);
6140 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 return NULL;
6142}
6143
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006144
Alexander Belopolsky40018472011-02-26 01:02:56 +00006145PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006146PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006148 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 char *p;
6150 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006151 Py_ssize_t expandsize, pos;
6152 int kind;
6153 void *data;
6154 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006156 if (!PyUnicode_Check(unicode)) {
6157 PyErr_BadArgument();
6158 return NULL;
6159 }
6160 if (PyUnicode_READY(unicode) < 0)
6161 return NULL;
6162 kind = PyUnicode_KIND(unicode);
6163 data = PyUnicode_DATA(unicode);
6164 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006165 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6166 bytes, and 1 byte characters 4. */
6167 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006168
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006169 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006170 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006171
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006172 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173 if (repr == NULL)
6174 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006175 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006176 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006178 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006179 for (pos = 0; pos < len; pos++) {
6180 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006181 /* Map 32-bit characters to '\Uxxxxxxxx' */
6182 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006183 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006184 *p++ = '\\';
6185 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006186 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6187 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6188 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6189 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6190 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6191 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6192 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6193 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006194 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006195 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006196 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197 *p++ = '\\';
6198 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006199 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6200 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6201 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6202 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 /* Copy everything else as-is */
6205 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206 *p++ = (char) ch;
6207 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006208
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006209 assert(p > q);
6210 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006211 return NULL;
6212 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213}
6214
Alexander Belopolsky40018472011-02-26 01:02:56 +00006215PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006216PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6217 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006219 PyObject *result;
6220 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6221 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006222 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006223 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6224 Py_DECREF(tmp);
6225 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226}
6227
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006228/* --- Unicode Internal Codec ------------------------------------------- */
6229
Alexander Belopolsky40018472011-02-26 01:02:56 +00006230PyObject *
6231_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006232 Py_ssize_t size,
6233 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006234{
6235 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006236 Py_ssize_t startinpos;
6237 Py_ssize_t endinpos;
6238 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006239 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006240 const char *end;
6241 const char *reason;
6242 PyObject *errorHandler = NULL;
6243 PyObject *exc = NULL;
6244
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006245 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006246 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006247 1))
6248 return NULL;
6249
Thomas Wouters89f507f2006-12-13 04:49:30 +00006250 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006251 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006252 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006254 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006255 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006256 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006257 end = s + size;
6258
6259 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006260 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006261 Py_UCS4 ch;
6262 /* We copy the raw representation one byte at a time because the
6263 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006264 ((char *) &uch)[0] = s[0];
6265 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006266#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006267 ((char *) &uch)[2] = s[2];
6268 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006269#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006270 ch = uch;
6271
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006272 /* We have to sanity check the raw data, otherwise doom looms for
6273 some malformed UCS-4 data. */
6274 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006275#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006276 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006277#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006278 end-s < Py_UNICODE_SIZE
6279 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006280 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006281 startinpos = s - starts;
6282 if (end-s < Py_UNICODE_SIZE) {
6283 endinpos = end-starts;
6284 reason = "truncated input";
6285 }
6286 else {
6287 endinpos = s - starts + Py_UNICODE_SIZE;
6288 reason = "illegal code point (> 0x10FFFF)";
6289 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006290 if (unicode_decode_call_errorhandler(
6291 errors, &errorHandler,
6292 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006293 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006294 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006295 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006296 continue;
6297 }
6298
6299 s += Py_UNICODE_SIZE;
6300#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006301 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006302 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006303 Py_UNICODE uch2;
6304 ((char *) &uch2)[0] = s[0];
6305 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006306 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006307 {
Victor Stinner551ac952011-11-29 22:58:13 +01006308 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006309 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006310 }
6311 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006312#endif
6313
6314 if (unicode_putchar(&v, &outpos, ch) < 0)
6315 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006316 }
6317
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006318 if (PyUnicode_Resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006319 goto onError;
6320 Py_XDECREF(errorHandler);
6321 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006322 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006323
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006325 Py_XDECREF(v);
6326 Py_XDECREF(errorHandler);
6327 Py_XDECREF(exc);
6328 return NULL;
6329}
6330
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331/* --- Latin-1 Codec ------------------------------------------------------ */
6332
Alexander Belopolsky40018472011-02-26 01:02:56 +00006333PyObject *
6334PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006335 Py_ssize_t size,
6336 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006339 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340}
6341
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006342/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006343static void
6344make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006345 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006346 PyObject *unicode,
6347 Py_ssize_t startpos, Py_ssize_t endpos,
6348 const char *reason)
6349{
6350 if (*exceptionObject == NULL) {
6351 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006352 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006353 encoding, unicode, startpos, endpos, reason);
6354 }
6355 else {
6356 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6357 goto onError;
6358 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6359 goto onError;
6360 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6361 goto onError;
6362 return;
6363 onError:
6364 Py_DECREF(*exceptionObject);
6365 *exceptionObject = NULL;
6366 }
6367}
6368
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006369/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006370static void
6371raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006372 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006373 PyObject *unicode,
6374 Py_ssize_t startpos, Py_ssize_t endpos,
6375 const char *reason)
6376{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006377 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006378 encoding, unicode, startpos, endpos, reason);
6379 if (*exceptionObject != NULL)
6380 PyCodec_StrictErrors(*exceptionObject);
6381}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006382
6383/* error handling callback helper:
6384 build arguments, call the callback and check the arguments,
6385 put the result into newpos and return the replacement string, which
6386 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006387static PyObject *
6388unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006389 PyObject **errorHandler,
6390 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006391 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006392 Py_ssize_t startpos, Py_ssize_t endpos,
6393 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006394{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006395 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006396 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006397 PyObject *restuple;
6398 PyObject *resunicode;
6399
6400 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006401 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006402 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006403 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006404 }
6405
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006406 if (PyUnicode_READY(unicode) < 0)
6407 return NULL;
6408 len = PyUnicode_GET_LENGTH(unicode);
6409
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006410 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006411 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006412 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006413 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414
6415 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006416 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006417 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006420 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006421 Py_DECREF(restuple);
6422 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006423 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006424 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006425 &resunicode, newpos)) {
6426 Py_DECREF(restuple);
6427 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006428 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006429 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6430 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6431 Py_DECREF(restuple);
6432 return NULL;
6433 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006434 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006435 *newpos = len + *newpos;
6436 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6438 Py_DECREF(restuple);
6439 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006440 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006441 Py_INCREF(resunicode);
6442 Py_DECREF(restuple);
6443 return resunicode;
6444}
6445
Alexander Belopolsky40018472011-02-26 01:02:56 +00006446static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006447unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006448 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006449 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006450{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006451 /* input state */
6452 Py_ssize_t pos=0, size;
6453 int kind;
6454 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006455 /* output object */
6456 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006457 /* pointer into the output */
6458 char *str;
6459 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006460 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006461 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6462 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006463 PyObject *errorHandler = NULL;
6464 PyObject *exc = NULL;
6465 /* the following variable is used for caching string comparisons
6466 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6467 int known_errorHandler = -1;
6468
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006469 if (PyUnicode_READY(unicode) < 0)
6470 return NULL;
6471 size = PyUnicode_GET_LENGTH(unicode);
6472 kind = PyUnicode_KIND(unicode);
6473 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006474 /* allocate enough for a simple encoding without
6475 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006476 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006477 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006478 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006479 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006480 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006481 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006482 ressize = size;
6483
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006484 while (pos < size) {
6485 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006486
Benjamin Peterson29060642009-01-31 22:14:21 +00006487 /* can we encode this? */
6488 if (c<limit) {
6489 /* no overflow check, because we know that the space is enough */
6490 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006491 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006492 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006493 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 Py_ssize_t requiredsize;
6495 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006496 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006498 Py_ssize_t collstart = pos;
6499 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006501 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 ++collend;
6503 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6504 if (known_errorHandler==-1) {
6505 if ((errors==NULL) || (!strcmp(errors, "strict")))
6506 known_errorHandler = 1;
6507 else if (!strcmp(errors, "replace"))
6508 known_errorHandler = 2;
6509 else if (!strcmp(errors, "ignore"))
6510 known_errorHandler = 3;
6511 else if (!strcmp(errors, "xmlcharrefreplace"))
6512 known_errorHandler = 4;
6513 else
6514 known_errorHandler = 0;
6515 }
6516 switch (known_errorHandler) {
6517 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006518 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006519 goto onError;
6520 case 2: /* replace */
6521 while (collstart++<collend)
6522 *str++ = '?'; /* fall through */
6523 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006524 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 break;
6526 case 4: /* xmlcharrefreplace */
6527 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006528 /* determine replacement size */
6529 for (i = collstart, repsize = 0; i < collend; ++i) {
6530 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6531 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006533 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006535 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006537 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006539 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006541 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006543 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006544 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006545 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006546 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006548 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 if (requiredsize > ressize) {
6550 if (requiredsize<2*ressize)
6551 requiredsize = 2*ressize;
6552 if (_PyBytes_Resize(&res, requiredsize))
6553 goto onError;
6554 str = PyBytes_AS_STRING(res) + respos;
6555 ressize = requiredsize;
6556 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006557 /* generate replacement */
6558 for (i = collstart; i < collend; ++i) {
6559 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006560 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006561 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006562 break;
6563 default:
6564 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006565 encoding, reason, unicode, &exc,
6566 collstart, collend, &newpos);
6567 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6568 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006569 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006570 if (PyBytes_Check(repunicode)) {
6571 /* Directly copy bytes result to output. */
6572 repsize = PyBytes_Size(repunicode);
6573 if (repsize > 1) {
6574 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006575 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006576 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6577 Py_DECREF(repunicode);
6578 goto onError;
6579 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006580 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006581 ressize += repsize-1;
6582 }
6583 memcpy(str, PyBytes_AsString(repunicode), repsize);
6584 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006585 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006586 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006587 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006588 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 /* need more space? (at least enough for what we
6590 have+the replacement+the rest of the string, so
6591 we won't have to check space for encodable characters) */
6592 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006593 repsize = PyUnicode_GET_LENGTH(repunicode);
6594 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006595 if (requiredsize > ressize) {
6596 if (requiredsize<2*ressize)
6597 requiredsize = 2*ressize;
6598 if (_PyBytes_Resize(&res, requiredsize)) {
6599 Py_DECREF(repunicode);
6600 goto onError;
6601 }
6602 str = PyBytes_AS_STRING(res) + respos;
6603 ressize = requiredsize;
6604 }
6605 /* check if there is anything unencodable in the replacement
6606 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006607 for (i = 0; repsize-->0; ++i, ++str) {
6608 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006609 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006610 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006611 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006612 Py_DECREF(repunicode);
6613 goto onError;
6614 }
6615 *str = (char)c;
6616 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006617 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006618 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006619 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006620 }
6621 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006622 /* Resize if we allocated to much */
6623 size = str - PyBytes_AS_STRING(res);
6624 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006625 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006626 if (_PyBytes_Resize(&res, size) < 0)
6627 goto onError;
6628 }
6629
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006630 Py_XDECREF(errorHandler);
6631 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006632 return res;
6633
6634 onError:
6635 Py_XDECREF(res);
6636 Py_XDECREF(errorHandler);
6637 Py_XDECREF(exc);
6638 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006639}
6640
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006641/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006642PyObject *
6643PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006644 Py_ssize_t size,
6645 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006647 PyObject *result;
6648 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6649 if (unicode == NULL)
6650 return NULL;
6651 result = unicode_encode_ucs1(unicode, errors, 256);
6652 Py_DECREF(unicode);
6653 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654}
6655
Alexander Belopolsky40018472011-02-26 01:02:56 +00006656PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006657_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658{
6659 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 PyErr_BadArgument();
6661 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006663 if (PyUnicode_READY(unicode) == -1)
6664 return NULL;
6665 /* Fast path: if it is a one-byte string, construct
6666 bytes object directly. */
6667 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6668 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6669 PyUnicode_GET_LENGTH(unicode));
6670 /* Non-Latin-1 characters present. Defer to above function to
6671 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006672 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006673}
6674
6675PyObject*
6676PyUnicode_AsLatin1String(PyObject *unicode)
6677{
6678 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679}
6680
6681/* --- 7-bit ASCII Codec -------------------------------------------------- */
6682
Alexander Belopolsky40018472011-02-26 01:02:56 +00006683PyObject *
6684PyUnicode_DecodeASCII(const char *s,
6685 Py_ssize_t size,
6686 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006688 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006689 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006690 int kind;
6691 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006692 Py_ssize_t startinpos;
6693 Py_ssize_t endinpos;
6694 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006695 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006696 int has_error;
6697 const unsigned char *p = (const unsigned char *)s;
6698 const unsigned char *end = p + size;
6699 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006700 PyObject *errorHandler = NULL;
6701 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006702
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006703 if (size == 0) {
6704 Py_INCREF(unicode_empty);
6705 return unicode_empty;
6706 }
6707
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006709 if (size == 1 && (unsigned char)s[0] < 128)
6710 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006711
Victor Stinner702c7342011-10-05 13:50:52 +02006712 has_error = 0;
6713 while (p < end && !has_error) {
6714 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6715 an explanation. */
6716 if (!((size_t) p & LONG_PTR_MASK)) {
6717 /* Help register allocation */
6718 register const unsigned char *_p = p;
6719 while (_p < aligned_end) {
6720 unsigned long value = *(unsigned long *) _p;
6721 if (value & ASCII_CHAR_MASK) {
6722 has_error = 1;
6723 break;
6724 }
6725 _p += SIZEOF_LONG;
6726 }
6727 if (_p == end)
6728 break;
6729 if (has_error)
6730 break;
6731 p = _p;
6732 }
6733 if (*p & 0x80) {
6734 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006735 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006736 }
6737 else {
6738 ++p;
6739 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006740 }
Victor Stinner702c7342011-10-05 13:50:52 +02006741 if (!has_error)
6742 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006743
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006744 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006748 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006749 kind = PyUnicode_KIND(v);
6750 data = PyUnicode_DATA(v);
6751 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006752 e = s + size;
6753 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006754 register unsigned char c = (unsigned char)*s;
6755 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006756 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 ++s;
6758 }
6759 else {
6760 startinpos = s-starts;
6761 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 if (unicode_decode_call_errorhandler(
6763 errors, &errorHandler,
6764 "ascii", "ordinal not in range(128)",
6765 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006766 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006768 kind = PyUnicode_KIND(v);
6769 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006772 if (PyUnicode_Resize(&v, outpos) < 0)
6773 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006774 Py_XDECREF(errorHandler);
6775 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006776 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006777 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006778
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006781 Py_XDECREF(errorHandler);
6782 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783 return NULL;
6784}
6785
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006786/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006787PyObject *
6788PyUnicode_EncodeASCII(const Py_UNICODE *p,
6789 Py_ssize_t size,
6790 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006792 PyObject *result;
6793 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6794 if (unicode == NULL)
6795 return NULL;
6796 result = unicode_encode_ucs1(unicode, errors, 128);
6797 Py_DECREF(unicode);
6798 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799}
6800
Alexander Belopolsky40018472011-02-26 01:02:56 +00006801PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006802_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803{
6804 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006805 PyErr_BadArgument();
6806 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006808 if (PyUnicode_READY(unicode) == -1)
6809 return NULL;
6810 /* Fast path: if it is an ASCII-only string, construct bytes object
6811 directly. Else defer to above function to raise the exception. */
6812 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6813 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6814 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006815 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006816}
6817
6818PyObject *
6819PyUnicode_AsASCIIString(PyObject *unicode)
6820{
6821 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822}
6823
Victor Stinner99b95382011-07-04 14:23:54 +02006824#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006825
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006826/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006827
/* Defined when size_t is wider than int.
   NOTE(review): the code page helpers below take int lengths, so this
   presumably enables chunked processing of inputs longer than INT_MAX --
   confirm at the NEED_RETRY use sites. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Fallback definition for headers that do not declare this flag. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6835
6836static char*
6837code_page_name(UINT code_page, PyObject **obj)
6838{
6839 *obj = NULL;
6840 if (code_page == CP_ACP)
6841 return "mbcs";
6842 if (code_page == CP_UTF7)
6843 return "CP_UTF7";
6844 if (code_page == CP_UTF8)
6845 return "CP_UTF8";
6846
6847 *obj = PyBytes_FromFormat("cp%u", code_page);
6848 if (*obj == NULL)
6849 return NULL;
6850 return PyBytes_AS_STRING(*obj);
6851}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006852
Alexander Belopolsky40018472011-02-26 01:02:56 +00006853static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006854is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006855{
6856 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006857 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006858
Victor Stinner3a50e702011-10-18 21:21:00 +02006859 if (!IsDBCSLeadByteEx(code_page, *curr))
6860 return 0;
6861
6862 prev = CharPrevExA(code_page, s, curr, 0);
6863 if (prev == curr)
6864 return 1;
6865 /* FIXME: This code is limited to "true" double-byte encodings,
6866 as it assumes an incomplete character consists of a single
6867 byte. */
6868 if (curr - prev == 2)
6869 return 1;
6870 if (!IsDBCSLeadByteEx(code_page, *prev))
6871 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006872 return 0;
6873}
6874
Victor Stinner3a50e702011-10-18 21:21:00 +02006875static DWORD
6876decode_code_page_flags(UINT code_page)
6877{
6878 if (code_page == CP_UTF7) {
6879 /* The CP_UTF7 decoder only supports flags=0 */
6880 return 0;
6881 }
6882 else
6883 return MB_ERR_INVALID_CHARS;
6884}
6885
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006886/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006887 * Decode a byte string from a Windows code page into unicode object in strict
6888 * mode.
6889 *
6890 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6891 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006892 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006893static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006894decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006895 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006896 const char *in,
6897 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006898{
Victor Stinner3a50e702011-10-18 21:21:00 +02006899 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006900 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006901 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006902
6903 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006904 assert(insize > 0);
6905 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6906 if (outsize <= 0)
6907 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006908
6909 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006910 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006911 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006912 if (*v == NULL)
6913 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006914 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006915 }
6916 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006918 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006919 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006921 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006922 }
6923
6924 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006925 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6926 if (outsize <= 0)
6927 goto error;
6928 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006929
Victor Stinner3a50e702011-10-18 21:21:00 +02006930error:
6931 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6932 return -2;
6933 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006934 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006935}
6936
Victor Stinner3a50e702011-10-18 21:21:00 +02006937/*
6938 * Decode a byte string from a code page into unicode object with an error
6939 * handler.
6940 *
6941 * Returns consumed size if succeed, or raise a WindowsError or
6942 * UnicodeDecodeError exception and returns -1 on error.
6943 */
6944static int
6945decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006946 PyObject **v,
6947 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006948 const char *errors)
6949{
6950 const char *startin = in;
6951 const char *endin = in + size;
6952 const DWORD flags = decode_code_page_flags(code_page);
6953 /* Ideally, we should get reason from FormatMessage. This is the Windows
6954 2000 English version of the message. */
6955 const char *reason = "No mapping for the Unicode character exists "
6956 "in the target code page.";
6957 /* each step cannot decode more than 1 character, but a character can be
6958 represented as a surrogate pair */
6959 wchar_t buffer[2], *startout, *out;
6960 int insize, outsize;
6961 PyObject *errorHandler = NULL;
6962 PyObject *exc = NULL;
6963 PyObject *encoding_obj = NULL;
6964 char *encoding;
6965 DWORD err;
6966 int ret = -1;
6967
6968 assert(size > 0);
6969
6970 encoding = code_page_name(code_page, &encoding_obj);
6971 if (encoding == NULL)
6972 return -1;
6973
6974 if (errors == NULL || strcmp(errors, "strict") == 0) {
6975 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6976 UnicodeDecodeError. */
6977 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6978 if (exc != NULL) {
6979 PyCodec_StrictErrors(exc);
6980 Py_CLEAR(exc);
6981 }
6982 goto error;
6983 }
6984
6985 if (*v == NULL) {
6986 /* Create unicode object */
6987 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6988 PyErr_NoMemory();
6989 goto error;
6990 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006991 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006992 if (*v == NULL)
6993 goto error;
6994 startout = PyUnicode_AS_UNICODE(*v);
6995 }
6996 else {
6997 /* Extend unicode object */
6998 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6999 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7000 PyErr_NoMemory();
7001 goto error;
7002 }
Victor Stinner76a31a62011-11-04 00:05:13 +01007003 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007004 goto error;
7005 startout = PyUnicode_AS_UNICODE(*v) + n;
7006 }
7007
7008 /* Decode the byte string character per character */
7009 out = startout;
7010 while (in < endin)
7011 {
7012 /* Decode a character */
7013 insize = 1;
7014 do
7015 {
7016 outsize = MultiByteToWideChar(code_page, flags,
7017 in, insize,
7018 buffer, Py_ARRAY_LENGTH(buffer));
7019 if (outsize > 0)
7020 break;
7021 err = GetLastError();
7022 if (err != ERROR_NO_UNICODE_TRANSLATION
7023 && err != ERROR_INSUFFICIENT_BUFFER)
7024 {
7025 PyErr_SetFromWindowsErr(0);
7026 goto error;
7027 }
7028 insize++;
7029 }
7030 /* 4=maximum length of a UTF-8 sequence */
7031 while (insize <= 4 && (in + insize) <= endin);
7032
7033 if (outsize <= 0) {
7034 Py_ssize_t startinpos, endinpos, outpos;
7035
7036 startinpos = in - startin;
7037 endinpos = startinpos + 1;
7038 outpos = out - PyUnicode_AS_UNICODE(*v);
7039 if (unicode_decode_call_errorhandler(
7040 errors, &errorHandler,
7041 encoding, reason,
7042 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007043 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007044 {
7045 goto error;
7046 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007047 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007048 }
7049 else {
7050 in += insize;
7051 memcpy(out, buffer, outsize * sizeof(wchar_t));
7052 out += outsize;
7053 }
7054 }
7055
7056 /* write a NUL character at the end */
7057 *out = 0;
7058
7059 /* Extend unicode object */
7060 outsize = out - startout;
7061 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007062 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007063 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007064 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007065
7066error:
7067 Py_XDECREF(encoding_obj);
7068 Py_XDECREF(errorHandler);
7069 Py_XDECREF(exc);
7070 return ret;
7071}
7072
Victor Stinner3a50e702011-10-18 21:21:00 +02007073static PyObject *
7074decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007075 const char *s, Py_ssize_t size,
7076 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007077{
Victor Stinner76a31a62011-11-04 00:05:13 +01007078 PyObject *v = NULL;
7079 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007080
Victor Stinner3a50e702011-10-18 21:21:00 +02007081 if (code_page < 0) {
7082 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7083 return NULL;
7084 }
7085
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007086 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007087 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007088
Victor Stinner76a31a62011-11-04 00:05:13 +01007089 do
7090 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007091#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007092 if (size > INT_MAX) {
7093 chunk_size = INT_MAX;
7094 final = 0;
7095 done = 0;
7096 }
7097 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007098#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007099 {
7100 chunk_size = (int)size;
7101 final = (consumed == NULL);
7102 done = 1;
7103 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007104
Victor Stinner76a31a62011-11-04 00:05:13 +01007105 /* Skip trailing lead-byte unless 'final' is set */
7106 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7107 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007108
Victor Stinner76a31a62011-11-04 00:05:13 +01007109 if (chunk_size == 0 && done) {
7110 if (v != NULL)
7111 break;
7112 Py_INCREF(unicode_empty);
7113 return unicode_empty;
7114 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007115
Victor Stinner76a31a62011-11-04 00:05:13 +01007116
7117 converted = decode_code_page_strict(code_page, &v,
7118 s, chunk_size);
7119 if (converted == -2)
7120 converted = decode_code_page_errors(code_page, &v,
7121 s, chunk_size,
7122 errors);
7123 assert(converted != 0);
7124
7125 if (converted < 0) {
7126 Py_XDECREF(v);
7127 return NULL;
7128 }
7129
7130 if (consumed)
7131 *consumed += converted;
7132
7133 s += converted;
7134 size -= converted;
7135 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007136
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007137 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007138}
7139
Alexander Belopolsky40018472011-02-26 01:02:56 +00007140PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007141PyUnicode_DecodeCodePageStateful(int code_page,
7142 const char *s,
7143 Py_ssize_t size,
7144 const char *errors,
7145 Py_ssize_t *consumed)
7146{
7147 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7148}
7149
7150PyObject *
7151PyUnicode_DecodeMBCSStateful(const char *s,
7152 Py_ssize_t size,
7153 const char *errors,
7154 Py_ssize_t *consumed)
7155{
7156 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7157}
7158
7159PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007160PyUnicode_DecodeMBCS(const char *s,
7161 Py_ssize_t size,
7162 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007163{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007164 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7165}
7166
Victor Stinner3a50e702011-10-18 21:21:00 +02007167static DWORD
7168encode_code_page_flags(UINT code_page, const char *errors)
7169{
7170 if (code_page == CP_UTF8) {
7171 if (winver.dwMajorVersion >= 6)
7172 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7173 and later */
7174 return WC_ERR_INVALID_CHARS;
7175 else
7176 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7177 return 0;
7178 }
7179 else if (code_page == CP_UTF7) {
7180 /* CP_UTF7 only supports flags=0 */
7181 return 0;
7182 }
7183 else {
7184 if (errors != NULL && strcmp(errors, "replace") == 0)
7185 return 0;
7186 else
7187 return WC_NO_BEST_FIT_CHARS;
7188 }
7189}
7190
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007191/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 * Encode a Unicode string to a Windows code page into a byte string in strict
7193 * mode.
7194 *
7195 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7196 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007197 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007198static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007199encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007200 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007201 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007202{
Victor Stinner554f3f02010-06-16 23:33:54 +00007203 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007204 BOOL *pusedDefaultChar = &usedDefaultChar;
7205 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007206 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007207 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007208 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007209 const DWORD flags = encode_code_page_flags(code_page, NULL);
7210 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007211 /* Create a substring so that we can get the UTF-16 representation
7212 of just the slice under consideration. */
7213 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007214
Martin v. Löwis3d325192011-11-04 18:23:06 +01007215 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007216
Victor Stinner3a50e702011-10-18 21:21:00 +02007217 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007218 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007219 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007220 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007221
Victor Stinner2fc507f2011-11-04 20:06:39 +01007222 substring = PyUnicode_Substring(unicode, offset, offset+len);
7223 if (substring == NULL)
7224 return -1;
7225 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7226 if (p == NULL) {
7227 Py_DECREF(substring);
7228 return -1;
7229 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007230
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007231 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007232 outsize = WideCharToMultiByte(code_page, flags,
7233 p, size,
7234 NULL, 0,
7235 NULL, pusedDefaultChar);
7236 if (outsize <= 0)
7237 goto error;
7238 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007239 if (pusedDefaultChar && *pusedDefaultChar) {
7240 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007241 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007242 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007243
Victor Stinner3a50e702011-10-18 21:21:00 +02007244 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007245 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007246 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007247 if (*outbytes == NULL) {
7248 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007249 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007250 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007251 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007252 }
7253 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007254 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007255 const Py_ssize_t n = PyBytes_Size(*outbytes);
7256 if (outsize > PY_SSIZE_T_MAX - n) {
7257 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007258 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007259 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007260 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007261 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7262 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007264 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007265 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007266 }
7267
7268 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007269 outsize = WideCharToMultiByte(code_page, flags,
7270 p, size,
7271 out, outsize,
7272 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007273 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007274 if (outsize <= 0)
7275 goto error;
7276 if (pusedDefaultChar && *pusedDefaultChar)
7277 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007278 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007279
Victor Stinner3a50e702011-10-18 21:21:00 +02007280error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007281 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007282 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7283 return -2;
7284 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007285 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007286}
7287
Victor Stinner3a50e702011-10-18 21:21:00 +02007288/*
7289 * Encode a Unicode string to a Windows code page into a byte string using a
7290 * error handler.
7291 *
7292 * Returns consumed characters if succeed, or raise a WindowsError and returns
7293 * -1 on other error.
7294 */
7295static int
7296encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007297 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007298 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007299{
Victor Stinner3a50e702011-10-18 21:21:00 +02007300 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007301 Py_ssize_t pos = unicode_offset;
7302 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007303 /* Ideally, we should get reason from FormatMessage. This is the Windows
7304 2000 English version of the message. */
7305 const char *reason = "invalid character";
7306 /* 4=maximum length of a UTF-8 sequence */
7307 char buffer[4];
7308 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7309 Py_ssize_t outsize;
7310 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007311 PyObject *errorHandler = NULL;
7312 PyObject *exc = NULL;
7313 PyObject *encoding_obj = NULL;
7314 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007315 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007316 PyObject *rep;
7317 int ret = -1;
7318
7319 assert(insize > 0);
7320
7321 encoding = code_page_name(code_page, &encoding_obj);
7322 if (encoding == NULL)
7323 return -1;
7324
7325 if (errors == NULL || strcmp(errors, "strict") == 0) {
7326 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7327 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007328 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007329 if (exc != NULL) {
7330 PyCodec_StrictErrors(exc);
7331 Py_DECREF(exc);
7332 }
7333 Py_XDECREF(encoding_obj);
7334 return -1;
7335 }
7336
7337 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7338 pusedDefaultChar = &usedDefaultChar;
7339 else
7340 pusedDefaultChar = NULL;
7341
7342 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7343 PyErr_NoMemory();
7344 goto error;
7345 }
7346 outsize = insize * Py_ARRAY_LENGTH(buffer);
7347
7348 if (*outbytes == NULL) {
7349 /* Create string object */
7350 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7351 if (*outbytes == NULL)
7352 goto error;
7353 out = PyBytes_AS_STRING(*outbytes);
7354 }
7355 else {
7356 /* Extend string object */
7357 Py_ssize_t n = PyBytes_Size(*outbytes);
7358 if (n > PY_SSIZE_T_MAX - outsize) {
7359 PyErr_NoMemory();
7360 goto error;
7361 }
7362 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7363 goto error;
7364 out = PyBytes_AS_STRING(*outbytes) + n;
7365 }
7366
7367 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007368 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007369 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007370 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7371 wchar_t chars[2];
7372 int charsize;
7373 if (ch < 0x10000) {
7374 chars[0] = (wchar_t)ch;
7375 charsize = 1;
7376 }
7377 else {
7378 ch -= 0x10000;
7379 chars[0] = 0xd800 + (ch >> 10);
7380 chars[1] = 0xdc00 + (ch & 0x3ff);
7381 charsize = 2;
7382 }
7383
Victor Stinner3a50e702011-10-18 21:21:00 +02007384 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007385 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007386 buffer, Py_ARRAY_LENGTH(buffer),
7387 NULL, pusedDefaultChar);
7388 if (outsize > 0) {
7389 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7390 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007391 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007392 memcpy(out, buffer, outsize);
7393 out += outsize;
7394 continue;
7395 }
7396 }
7397 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7398 PyErr_SetFromWindowsErr(0);
7399 goto error;
7400 }
7401
Victor Stinner3a50e702011-10-18 21:21:00 +02007402 rep = unicode_encode_call_errorhandler(
7403 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007404 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007405 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007406 if (rep == NULL)
7407 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007408 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007409
7410 if (PyBytes_Check(rep)) {
7411 outsize = PyBytes_GET_SIZE(rep);
7412 if (outsize != 1) {
7413 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7414 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7415 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7416 Py_DECREF(rep);
7417 goto error;
7418 }
7419 out = PyBytes_AS_STRING(*outbytes) + offset;
7420 }
7421 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7422 out += outsize;
7423 }
7424 else {
7425 Py_ssize_t i;
7426 enum PyUnicode_Kind kind;
7427 void *data;
7428
7429 if (PyUnicode_READY(rep) < 0) {
7430 Py_DECREF(rep);
7431 goto error;
7432 }
7433
7434 outsize = PyUnicode_GET_LENGTH(rep);
7435 if (outsize != 1) {
7436 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7437 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7438 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7439 Py_DECREF(rep);
7440 goto error;
7441 }
7442 out = PyBytes_AS_STRING(*outbytes) + offset;
7443 }
7444 kind = PyUnicode_KIND(rep);
7445 data = PyUnicode_DATA(rep);
7446 for (i=0; i < outsize; i++) {
7447 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7448 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007449 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007450 encoding, unicode,
7451 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 "unable to encode error handler result to ASCII");
7453 Py_DECREF(rep);
7454 goto error;
7455 }
7456 *out = (unsigned char)ch;
7457 out++;
7458 }
7459 }
7460 Py_DECREF(rep);
7461 }
7462 /* write a NUL byte */
7463 *out = 0;
7464 outsize = out - PyBytes_AS_STRING(*outbytes);
7465 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7466 if (_PyBytes_Resize(outbytes, outsize) < 0)
7467 goto error;
7468 ret = 0;
7469
7470error:
7471 Py_XDECREF(encoding_obj);
7472 Py_XDECREF(errorHandler);
7473 Py_XDECREF(exc);
7474 return ret;
7475}
7476
Victor Stinner3a50e702011-10-18 21:21:00 +02007477static PyObject *
7478encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007479 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 const char *errors)
7481{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007482 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007484 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007485 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007486
Victor Stinner2fc507f2011-11-04 20:06:39 +01007487 if (PyUnicode_READY(unicode) < 0)
7488 return NULL;
7489 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007490
Victor Stinner3a50e702011-10-18 21:21:00 +02007491 if (code_page < 0) {
7492 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7493 return NULL;
7494 }
7495
Martin v. Löwis3d325192011-11-04 18:23:06 +01007496 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007497 return PyBytes_FromStringAndSize(NULL, 0);
7498
Victor Stinner7581cef2011-11-03 22:32:33 +01007499 offset = 0;
7500 do
7501 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007502#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007503 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007504 chunks. */
7505 if (len > INT_MAX/2) {
7506 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007507 done = 0;
7508 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007509 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007510#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007511 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007512 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007513 done = 1;
7514 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007515
Victor Stinner76a31a62011-11-04 00:05:13 +01007516 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007517 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007518 errors);
7519 if (ret == -2)
7520 ret = encode_code_page_errors(code_page, &outbytes,
7521 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007522 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007523 if (ret < 0) {
7524 Py_XDECREF(outbytes);
7525 return NULL;
7526 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007527
Victor Stinner7581cef2011-11-03 22:32:33 +01007528 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007529 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007530 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007531
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 return outbytes;
7533}
7534
7535PyObject *
7536PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7537 Py_ssize_t size,
7538 const char *errors)
7539{
Victor Stinner7581cef2011-11-03 22:32:33 +01007540 PyObject *unicode, *res;
7541 unicode = PyUnicode_FromUnicode(p, size);
7542 if (unicode == NULL)
7543 return NULL;
7544 res = encode_code_page(CP_ACP, unicode, errors);
7545 Py_DECREF(unicode);
7546 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007547}
7548
7549PyObject *
7550PyUnicode_EncodeCodePage(int code_page,
7551 PyObject *unicode,
7552 const char *errors)
7553{
Victor Stinner7581cef2011-11-03 22:32:33 +01007554 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007555}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007556
Alexander Belopolsky40018472011-02-26 01:02:56 +00007557PyObject *
7558PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007559{
7560 if (!PyUnicode_Check(unicode)) {
7561 PyErr_BadArgument();
7562 return NULL;
7563 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007564 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007565}
7566
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007567#undef NEED_RETRY
7568
Victor Stinner99b95382011-07-04 14:23:54 +02007569#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007570
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571/* --- Character Mapping Codec -------------------------------------------- */
7572
Alexander Belopolsky40018472011-02-26 01:02:56 +00007573PyObject *
7574PyUnicode_DecodeCharmap(const char *s,
7575 Py_ssize_t size,
7576 PyObject *mapping,
7577 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007579 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007580 Py_ssize_t startinpos;
7581 Py_ssize_t endinpos;
7582 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007583 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007584 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007585 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007586 PyObject *errorHandler = NULL;
7587 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007588
Guido van Rossumd57fd912000-03-10 22:53:23 +00007589 /* Default to Latin-1 */
7590 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007591 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007593 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007597 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007598 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007599 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007600 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007601 Py_ssize_t maplen;
7602 enum PyUnicode_Kind kind;
7603 void *data;
7604 Py_UCS4 x;
7605
7606 if (PyUnicode_READY(mapping) < 0)
7607 return NULL;
7608
7609 maplen = PyUnicode_GET_LENGTH(mapping);
7610 data = PyUnicode_DATA(mapping);
7611 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007612 while (s < e) {
7613 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614
Benjamin Peterson29060642009-01-31 22:14:21 +00007615 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007616 x = PyUnicode_READ(kind, data, ch);
7617 else
7618 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007620 if (x == 0xfffe)
7621 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007622 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007623 startinpos = s-starts;
7624 endinpos = startinpos+1;
7625 if (unicode_decode_call_errorhandler(
7626 errors, &errorHandler,
7627 "charmap", "character maps to <undefined>",
7628 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007629 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 goto onError;
7631 }
7632 continue;
7633 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007634
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007635 if (unicode_putchar(&v, &outpos, x) < 0)
7636 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007638 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007639 }
7640 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 while (s < e) {
7642 unsigned char ch = *s;
7643 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007644
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7646 w = PyLong_FromLong((long)ch);
7647 if (w == NULL)
7648 goto onError;
7649 x = PyObject_GetItem(mapping, w);
7650 Py_DECREF(w);
7651 if (x == NULL) {
7652 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7653 /* No mapping found means: mapping is undefined. */
7654 PyErr_Clear();
7655 x = Py_None;
7656 Py_INCREF(x);
7657 } else
7658 goto onError;
7659 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007660
Benjamin Peterson29060642009-01-31 22:14:21 +00007661 /* Apply mapping */
7662 if (PyLong_Check(x)) {
7663 long value = PyLong_AS_LONG(x);
7664 if (value < 0 || value > 65535) {
7665 PyErr_SetString(PyExc_TypeError,
7666 "character mapping must be in range(65536)");
7667 Py_DECREF(x);
7668 goto onError;
7669 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007670 if (unicode_putchar(&v, &outpos, value) < 0)
7671 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007672 }
7673 else if (x == Py_None) {
7674 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 startinpos = s-starts;
7676 endinpos = startinpos+1;
7677 if (unicode_decode_call_errorhandler(
7678 errors, &errorHandler,
7679 "charmap", "character maps to <undefined>",
7680 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007681 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007682 Py_DECREF(x);
7683 goto onError;
7684 }
7685 Py_DECREF(x);
7686 continue;
7687 }
7688 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007689 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007690
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007691 if (PyUnicode_READY(x) < 0)
7692 goto onError;
7693 targetsize = PyUnicode_GET_LENGTH(x);
7694
7695 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007696 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007697 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007698 PyUnicode_READ_CHAR(x, 0)) < 0)
7699 goto onError;
7700 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007701 else if (targetsize > 1) {
7702 /* 1-n mapping */
7703 if (targetsize > extrachars) {
7704 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007705 Py_ssize_t needed = (targetsize - extrachars) + \
7706 (targetsize << 2);
7707 extrachars += needed;
7708 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007709 if (PyUnicode_Resize(&v,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007710 PyUnicode_GET_LENGTH(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007711 Py_DECREF(x);
7712 goto onError;
7713 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007715 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7716 goto onError;
7717 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7718 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007719 extrachars -= targetsize;
7720 }
7721 /* 1-0 mapping: skip the character */
7722 }
7723 else {
7724 /* wrong return value */
7725 PyErr_SetString(PyExc_TypeError,
7726 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007727 Py_DECREF(x);
7728 goto onError;
7729 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007730 Py_DECREF(x);
7731 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007732 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007734 if (PyUnicode_Resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007735 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007736 Py_XDECREF(errorHandler);
7737 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007738 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007739
Benjamin Peterson29060642009-01-31 22:14:21 +00007740 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007741 Py_XDECREF(errorHandler);
7742 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743 Py_XDECREF(v);
7744 return NULL;
7745}
7746
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007747/* Charmap encoding: the lookup table */
7748
Alexander Belopolsky40018472011-02-26 01:02:56 +00007749struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 PyObject_HEAD
7751 unsigned char level1[32];
7752 int count2, count3;
7753 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007754};
7755
7756static PyObject*
7757encoding_map_size(PyObject *obj, PyObject* args)
7758{
7759 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007760 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007762}
7763
7764static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007765 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007766 PyDoc_STR("Return the size (in bytes) of this object") },
7767 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007768};
7769
7770static void
7771encoding_map_dealloc(PyObject* o)
7772{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007773 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007774}
7775
7776static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007777 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007778 "EncodingMap", /*tp_name*/
7779 sizeof(struct encoding_map), /*tp_basicsize*/
7780 0, /*tp_itemsize*/
7781 /* methods */
7782 encoding_map_dealloc, /*tp_dealloc*/
7783 0, /*tp_print*/
7784 0, /*tp_getattr*/
7785 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007786 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 0, /*tp_repr*/
7788 0, /*tp_as_number*/
7789 0, /*tp_as_sequence*/
7790 0, /*tp_as_mapping*/
7791 0, /*tp_hash*/
7792 0, /*tp_call*/
7793 0, /*tp_str*/
7794 0, /*tp_getattro*/
7795 0, /*tp_setattro*/
7796 0, /*tp_as_buffer*/
7797 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7798 0, /*tp_doc*/
7799 0, /*tp_traverse*/
7800 0, /*tp_clear*/
7801 0, /*tp_richcompare*/
7802 0, /*tp_weaklistoffset*/
7803 0, /*tp_iter*/
7804 0, /*tp_iternext*/
7805 encoding_map_methods, /*tp_methods*/
7806 0, /*tp_members*/
7807 0, /*tp_getset*/
7808 0, /*tp_base*/
7809 0, /*tp_dict*/
7810 0, /*tp_descr_get*/
7811 0, /*tp_descr_set*/
7812 0, /*tp_dictoffset*/
7813 0, /*tp_init*/
7814 0, /*tp_alloc*/
7815 0, /*tp_new*/
7816 0, /*tp_free*/
7817 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007818};
7819
7820PyObject*
7821PyUnicode_BuildEncodingMap(PyObject* string)
7822{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007823 PyObject *result;
7824 struct encoding_map *mresult;
7825 int i;
7826 int need_dict = 0;
7827 unsigned char level1[32];
7828 unsigned char level2[512];
7829 unsigned char *mlevel1, *mlevel2, *mlevel3;
7830 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007831 int kind;
7832 void *data;
7833 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007834
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007835 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007836 PyErr_BadArgument();
7837 return NULL;
7838 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007839 kind = PyUnicode_KIND(string);
7840 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007841 memset(level1, 0xFF, sizeof level1);
7842 memset(level2, 0xFF, sizeof level2);
7843
7844 /* If there isn't a one-to-one mapping of NULL to \0,
7845 or if there are non-BMP characters, we need to use
7846 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007847 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007848 need_dict = 1;
7849 for (i = 1; i < 256; i++) {
7850 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007851 ch = PyUnicode_READ(kind, data, i);
7852 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007853 need_dict = 1;
7854 break;
7855 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007856 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007857 /* unmapped character */
7858 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007859 l1 = ch >> 11;
7860 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007861 if (level1[l1] == 0xFF)
7862 level1[l1] = count2++;
7863 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007864 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007865 }
7866
7867 if (count2 >= 0xFF || count3 >= 0xFF)
7868 need_dict = 1;
7869
7870 if (need_dict) {
7871 PyObject *result = PyDict_New();
7872 PyObject *key, *value;
7873 if (!result)
7874 return NULL;
7875 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007876 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007877 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007878 if (!key || !value)
7879 goto failed1;
7880 if (PyDict_SetItem(result, key, value) == -1)
7881 goto failed1;
7882 Py_DECREF(key);
7883 Py_DECREF(value);
7884 }
7885 return result;
7886 failed1:
7887 Py_XDECREF(key);
7888 Py_XDECREF(value);
7889 Py_DECREF(result);
7890 return NULL;
7891 }
7892
7893 /* Create a three-level trie */
7894 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7895 16*count2 + 128*count3 - 1);
7896 if (!result)
7897 return PyErr_NoMemory();
7898 PyObject_Init(result, &EncodingMapType);
7899 mresult = (struct encoding_map*)result;
7900 mresult->count2 = count2;
7901 mresult->count3 = count3;
7902 mlevel1 = mresult->level1;
7903 mlevel2 = mresult->level23;
7904 mlevel3 = mresult->level23 + 16*count2;
7905 memcpy(mlevel1, level1, 32);
7906 memset(mlevel2, 0xFF, 16*count2);
7907 memset(mlevel3, 0, 128*count3);
7908 count3 = 0;
7909 for (i = 1; i < 256; i++) {
7910 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007911 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007912 /* unmapped character */
7913 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007914 o1 = PyUnicode_READ(kind, data, i)>>11;
7915 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007916 i2 = 16*mlevel1[o1] + o2;
7917 if (mlevel2[i2] == 0xFF)
7918 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007919 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007920 i3 = 128*mlevel2[i2] + o3;
7921 mlevel3[i3] = i;
7922 }
7923 return result;
7924}
7925
7926static int
Victor Stinner22168992011-11-20 17:09:18 +01007927encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007928{
7929 struct encoding_map *map = (struct encoding_map*)mapping;
7930 int l1 = c>>11;
7931 int l2 = (c>>7) & 0xF;
7932 int l3 = c & 0x7F;
7933 int i;
7934
Victor Stinner22168992011-11-20 17:09:18 +01007935 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007936 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007937 if (c == 0)
7938 return 0;
7939 /* level 1*/
7940 i = map->level1[l1];
7941 if (i == 0xFF) {
7942 return -1;
7943 }
7944 /* level 2*/
7945 i = map->level23[16*i+l2];
7946 if (i == 0xFF) {
7947 return -1;
7948 }
7949 /* level 3 */
7950 i = map->level23[16*map->count2 + 128*i + l3];
7951 if (i == 0) {
7952 return -1;
7953 }
7954 return i;
7955}
7956
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007957/* Lookup the character ch in the mapping. If the character
7958 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007959 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007960static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007961charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962{
Christian Heimes217cfd12007-12-02 14:31:20 +00007963 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007964 PyObject *x;
7965
7966 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007967 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007968 x = PyObject_GetItem(mapping, w);
7969 Py_DECREF(w);
7970 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007971 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7972 /* No mapping found means: mapping is undefined. */
7973 PyErr_Clear();
7974 x = Py_None;
7975 Py_INCREF(x);
7976 return x;
7977 } else
7978 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007980 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007981 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007982 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 long value = PyLong_AS_LONG(x);
7984 if (value < 0 || value > 255) {
7985 PyErr_SetString(PyExc_TypeError,
7986 "character mapping must be in range(256)");
7987 Py_DECREF(x);
7988 return NULL;
7989 }
7990 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007992 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 /* wrong return value */
7996 PyErr_Format(PyExc_TypeError,
7997 "character mapping must return integer, bytes or None, not %.400s",
7998 x->ob_type->tp_name);
7999 Py_DECREF(x);
8000 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001 }
8002}
8003
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008004static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008005charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008006{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008007 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8008 /* exponentially overallocate to minimize reallocations */
8009 if (requiredsize < 2*outsize)
8010 requiredsize = 2*outsize;
8011 if (_PyBytes_Resize(outobj, requiredsize))
8012 return -1;
8013 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008014}
8015
/* Outcome of charmapencode_output() for a single character. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred while encoding */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008019/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008020 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008021 space is available. Return a new reference to the object that
8022 was put in the output buffer, or Py_None, if the mapping was undefined
8023 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008024 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008025static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008026charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008027 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008028{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008029 PyObject *rep;
8030 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008031 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008032
Christian Heimes90aa7642007-12-19 02:45:37 +00008033 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008034 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008036 if (res == -1)
8037 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008038 if (outsize<requiredsize)
8039 if (charmapencode_resize(outobj, outpos, requiredsize))
8040 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008041 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 outstart[(*outpos)++] = (char)res;
8043 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008044 }
8045
8046 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008047 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008049 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008050 Py_DECREF(rep);
8051 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008052 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008053 if (PyLong_Check(rep)) {
8054 Py_ssize_t requiredsize = *outpos+1;
8055 if (outsize<requiredsize)
8056 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8057 Py_DECREF(rep);
8058 return enc_EXCEPTION;
8059 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008060 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008061 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008062 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 else {
8064 const char *repchars = PyBytes_AS_STRING(rep);
8065 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8066 Py_ssize_t requiredsize = *outpos+repsize;
8067 if (outsize<requiredsize)
8068 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8069 Py_DECREF(rep);
8070 return enc_EXCEPTION;
8071 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008072 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008073 memcpy(outstart + *outpos, repchars, repsize);
8074 *outpos += repsize;
8075 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008076 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008077 Py_DECREF(rep);
8078 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008079}
8080
8081/* handle an error in PyUnicode_EncodeCharmap
8082 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008083static int
8084charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008085 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008086 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008087 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008088 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008089{
8090 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008091 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008092 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008093 enum PyUnicode_Kind kind;
8094 void *data;
8095 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008096 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008097 Py_ssize_t collstartpos = *inpos;
8098 Py_ssize_t collendpos = *inpos+1;
8099 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008100 char *encoding = "charmap";
8101 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008102 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008103 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008104 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008105
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008106 if (PyUnicode_READY(unicode) < 0)
8107 return -1;
8108 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008109 /* find all unencodable characters */
8110 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008111 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008112 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008113 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008114 val = encoding_map_lookup(ch, mapping);
8115 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008116 break;
8117 ++collendpos;
8118 continue;
8119 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008120
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008121 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8122 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008123 if (rep==NULL)
8124 return -1;
8125 else if (rep!=Py_None) {
8126 Py_DECREF(rep);
8127 break;
8128 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008129 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008130 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008131 }
8132 /* cache callback name lookup
8133 * (if not done yet, i.e. it's the first error) */
8134 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 if ((errors==NULL) || (!strcmp(errors, "strict")))
8136 *known_errorHandler = 1;
8137 else if (!strcmp(errors, "replace"))
8138 *known_errorHandler = 2;
8139 else if (!strcmp(errors, "ignore"))
8140 *known_errorHandler = 3;
8141 else if (!strcmp(errors, "xmlcharrefreplace"))
8142 *known_errorHandler = 4;
8143 else
8144 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008145 }
8146 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008147 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008148 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008149 return -1;
8150 case 2: /* replace */
8151 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008152 x = charmapencode_output('?', mapping, res, respos);
8153 if (x==enc_EXCEPTION) {
8154 return -1;
8155 }
8156 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008157 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008158 return -1;
8159 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008160 }
8161 /* fall through */
8162 case 3: /* ignore */
8163 *inpos = collendpos;
8164 break;
8165 case 4: /* xmlcharrefreplace */
8166 /* generate replacement (temporarily (mis)uses p) */
8167 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 char buffer[2+29+1+1];
8169 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008170 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008171 for (cp = buffer; *cp; ++cp) {
8172 x = charmapencode_output(*cp, mapping, res, respos);
8173 if (x==enc_EXCEPTION)
8174 return -1;
8175 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008176 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 return -1;
8178 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008179 }
8180 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008181 *inpos = collendpos;
8182 break;
8183 default:
8184 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008185 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008187 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008188 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008189 if (PyBytes_Check(repunicode)) {
8190 /* Directly copy bytes result to output. */
8191 Py_ssize_t outsize = PyBytes_Size(*res);
8192 Py_ssize_t requiredsize;
8193 repsize = PyBytes_Size(repunicode);
8194 requiredsize = *respos + repsize;
8195 if (requiredsize > outsize)
8196 /* Make room for all additional bytes. */
8197 if (charmapencode_resize(res, respos, requiredsize)) {
8198 Py_DECREF(repunicode);
8199 return -1;
8200 }
8201 memcpy(PyBytes_AsString(*res) + *respos,
8202 PyBytes_AsString(repunicode), repsize);
8203 *respos += repsize;
8204 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008205 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008206 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008207 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008208 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008209 if (PyUnicode_READY(repunicode) < 0) {
8210 Py_DECREF(repunicode);
8211 return -1;
8212 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008213 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008214 data = PyUnicode_DATA(repunicode);
8215 kind = PyUnicode_KIND(repunicode);
8216 for (index = 0; index < repsize; index++) {
8217 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8218 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008220 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 return -1;
8222 }
8223 else if (x==enc_FAILED) {
8224 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008225 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008226 return -1;
8227 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008228 }
8229 *inpos = newpos;
8230 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008231 }
8232 return 0;
8233}
8234
Alexander Belopolsky40018472011-02-26 01:02:56 +00008235PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008236_PyUnicode_EncodeCharmap(PyObject *unicode,
8237 PyObject *mapping,
8238 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240 /* output object */
8241 PyObject *res = NULL;
8242 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008243 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008244 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008245 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008246 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008247 PyObject *errorHandler = NULL;
8248 PyObject *exc = NULL;
8249 /* the following variable is used for caching string comparisons
8250 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8251 * 3=ignore, 4=xmlcharrefreplace */
8252 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008253
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008254 if (PyUnicode_READY(unicode) < 0)
8255 return NULL;
8256 size = PyUnicode_GET_LENGTH(unicode);
8257
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258 /* Default to Latin-1 */
8259 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008260 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262 /* allocate enough for a simple encoding without
8263 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008264 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008265 if (res == NULL)
8266 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008267 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008270 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008271 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008273 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 if (x==enc_EXCEPTION) /* error */
8275 goto onError;
8276 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008277 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 &exc,
8279 &known_errorHandler, &errorHandler, errors,
8280 &res, &respos)) {
8281 goto onError;
8282 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008283 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 else
8285 /* done with this character => adjust input position */
8286 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008289 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008290 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008291 if (_PyBytes_Resize(&res, respos) < 0)
8292 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008293
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008294 Py_XDECREF(exc);
8295 Py_XDECREF(errorHandler);
8296 return res;
8297
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008299 Py_XDECREF(res);
8300 Py_XDECREF(exc);
8301 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 return NULL;
8303}
8304
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008305/* Deprecated */
8306PyObject *
8307PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8308 Py_ssize_t size,
8309 PyObject *mapping,
8310 const char *errors)
8311{
8312 PyObject *result;
8313 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8314 if (unicode == NULL)
8315 return NULL;
8316 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8317 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008318 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008319}
8320
Alexander Belopolsky40018472011-02-26 01:02:56 +00008321PyObject *
8322PyUnicode_AsCharmapString(PyObject *unicode,
8323 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324{
8325 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 PyErr_BadArgument();
8327 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008329 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330}
8331
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008332/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008333static void
8334make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008335 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008336 Py_ssize_t startpos, Py_ssize_t endpos,
8337 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008339 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008340 *exceptionObject = _PyUnicodeTranslateError_Create(
8341 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342 }
8343 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8345 goto onError;
8346 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8347 goto onError;
8348 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8349 goto onError;
8350 return;
8351 onError:
8352 Py_DECREF(*exceptionObject);
8353 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354 }
8355}
8356
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008357/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008358static void
8359raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008360 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008361 Py_ssize_t startpos, Py_ssize_t endpos,
8362 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008363{
8364 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008365 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008366 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008368}
8369
8370/* error handling callback helper:
8371 build arguments, call the callback and check the arguments,
8372 put the result into newpos and return the replacement string, which
8373 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008374static PyObject *
8375unicode_translate_call_errorhandler(const char *errors,
8376 PyObject **errorHandler,
8377 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008378 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008379 Py_ssize_t startpos, Py_ssize_t endpos,
8380 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008382 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008383
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008384 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385 PyObject *restuple;
8386 PyObject *resunicode;
8387
8388 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 }
8393
8394 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008395 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008396 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008398
8399 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008401 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008404 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 Py_DECREF(restuple);
8406 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008407 }
8408 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 &resunicode, &i_newpos)) {
8410 Py_DECREF(restuple);
8411 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008412 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008413 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008414 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008415 else
8416 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008417 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8419 Py_DECREF(restuple);
8420 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008421 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422 Py_INCREF(resunicode);
8423 Py_DECREF(restuple);
8424 return resunicode;
8425}
8426
8427/* Lookup the character ch in the mapping and put the result in result,
8428 which must be decrefed by the caller.
8429 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008430static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008431charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008432{
Christian Heimes217cfd12007-12-02 14:31:20 +00008433 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434 PyObject *x;
8435
8436 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008438 x = PyObject_GetItem(mapping, w);
8439 Py_DECREF(w);
8440 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8442 /* No mapping found means: use 1:1 mapping. */
8443 PyErr_Clear();
8444 *result = NULL;
8445 return 0;
8446 } else
8447 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008448 }
8449 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 *result = x;
8451 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008452 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008453 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 long value = PyLong_AS_LONG(x);
8455 long max = PyUnicode_GetMax();
8456 if (value < 0 || value > max) {
8457 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008458 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 Py_DECREF(x);
8460 return -1;
8461 }
8462 *result = x;
8463 return 0;
8464 }
8465 else if (PyUnicode_Check(x)) {
8466 *result = x;
8467 return 0;
8468 }
8469 else {
8470 /* wrong return value */
8471 PyErr_SetString(PyExc_TypeError,
8472 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008473 Py_DECREF(x);
8474 return -1;
8475 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008476}
8477/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 if not reallocate and adjust various state variables.
8479 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008480static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008481charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008483{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008484 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008485 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 /* exponentially overallocate to minimize reallocations */
8487 if (requiredsize < 2 * oldsize)
8488 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008489 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8490 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008492 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008493 }
8494 return 0;
8495}
8496/* lookup the character, put the result in the output string and adjust
8497 various state variables. Return a new reference to the object that
8498 was put in the output buffer in *result, or Py_None, if the mapping was
8499 undefined (in which case no character was written).
8500 The called must decref result.
8501 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008502static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008503charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8504 PyObject *mapping, Py_UCS4 **output,
8505 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008506 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008507{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008508 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8509 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008511 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008513 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008514 }
8515 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008517 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008519 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520 }
8521 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008522 Py_ssize_t repsize;
8523 if (PyUnicode_READY(*res) == -1)
8524 return -1;
8525 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 if (repsize==1) {
8527 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008528 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 }
8530 else if (repsize!=0) {
8531 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008532 Py_ssize_t requiredsize = *opos +
8533 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008534 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008535 Py_ssize_t i;
8536 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008537 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538 for(i = 0; i < repsize; i++)
8539 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008541 }
8542 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008543 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008544 return 0;
8545}
8546
Alexander Belopolsky40018472011-02-26 01:02:56 +00008547PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008548_PyUnicode_TranslateCharmap(PyObject *input,
8549 PyObject *mapping,
8550 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008552 /* input object */
8553 char *idata;
8554 Py_ssize_t size, i;
8555 int kind;
8556 /* output buffer */
8557 Py_UCS4 *output = NULL;
8558 Py_ssize_t osize;
8559 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008560 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008561 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008562 char *reason = "character maps to <undefined>";
8563 PyObject *errorHandler = NULL;
8564 PyObject *exc = NULL;
8565 /* the following variable is used for caching string comparisons
8566 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8567 * 3=ignore, 4=xmlcharrefreplace */
8568 int known_errorHandler = -1;
8569
Guido van Rossumd57fd912000-03-10 22:53:23 +00008570 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 PyErr_BadArgument();
8572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008574
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008575 if (PyUnicode_READY(input) == -1)
8576 return NULL;
8577 idata = (char*)PyUnicode_DATA(input);
8578 kind = PyUnicode_KIND(input);
8579 size = PyUnicode_GET_LENGTH(input);
8580 i = 0;
8581
8582 if (size == 0) {
8583 Py_INCREF(input);
8584 return input;
8585 }
8586
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008587 /* allocate enough for a simple 1:1 translation without
8588 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 osize = size;
8590 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8591 opos = 0;
8592 if (output == NULL) {
8593 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008597 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 /* try to encode it */
8599 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008600 if (charmaptranslate_output(input, i, mapping,
8601 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 Py_XDECREF(x);
8603 goto onError;
8604 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008605 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008606 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008608 else { /* untranslatable character */
8609 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8610 Py_ssize_t repsize;
8611 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008612 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008614 Py_ssize_t collstart = i;
8615 Py_ssize_t collend = i+1;
8616 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 while (collend < size) {
8620 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 goto onError;
8622 Py_XDECREF(x);
8623 if (x!=Py_None)
8624 break;
8625 ++collend;
8626 }
8627 /* cache callback name lookup
8628 * (if not done yet, i.e. it's the first error) */
8629 if (known_errorHandler==-1) {
8630 if ((errors==NULL) || (!strcmp(errors, "strict")))
8631 known_errorHandler = 1;
8632 else if (!strcmp(errors, "replace"))
8633 known_errorHandler = 2;
8634 else if (!strcmp(errors, "ignore"))
8635 known_errorHandler = 3;
8636 else if (!strcmp(errors, "xmlcharrefreplace"))
8637 known_errorHandler = 4;
8638 else
8639 known_errorHandler = 0;
8640 }
8641 switch (known_errorHandler) {
8642 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643 raise_translate_exception(&exc, input, collstart,
8644 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008645 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 case 2: /* replace */
8647 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648 for (coll = collstart; coll<collend; coll++)
8649 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 /* fall through */
8651 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008652 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 break;
8654 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008655 /* generate replacement (temporarily (mis)uses i) */
8656 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008657 char buffer[2+29+1+1];
8658 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8660 if (charmaptranslate_makespace(&output, &osize,
8661 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 goto onError;
8663 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 break;
8668 default:
8669 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008670 reason, input, &exc,
8671 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008672 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008674 if (PyUnicode_READY(repunicode) < 0) {
8675 Py_DECREF(repunicode);
8676 goto onError;
8677 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 repsize = PyUnicode_GET_LENGTH(repunicode);
8680 if (charmaptranslate_makespace(&output, &osize,
8681 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 Py_DECREF(repunicode);
8683 goto onError;
8684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008685 for (uni2 = 0; repsize-->0; ++uni2)
8686 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8687 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008689 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008690 }
8691 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008692 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8693 if (!res)
8694 goto onError;
8695 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008696 Py_XDECREF(exc);
8697 Py_XDECREF(errorHandler);
8698 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699
Benjamin Peterson29060642009-01-31 22:14:21 +00008700 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008702 Py_XDECREF(exc);
8703 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704 return NULL;
8705}
8706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707/* Deprecated. Use PyUnicode_Translate instead. */
8708PyObject *
8709PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8710 Py_ssize_t size,
8711 PyObject *mapping,
8712 const char *errors)
8713{
8714 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8715 if (!unicode)
8716 return NULL;
8717 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8718}
8719
Alexander Belopolsky40018472011-02-26 01:02:56 +00008720PyObject *
8721PyUnicode_Translate(PyObject *str,
8722 PyObject *mapping,
8723 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724{
8725 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008726
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727 str = PyUnicode_FromObject(str);
8728 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008730 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731 Py_DECREF(str);
8732 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008733
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008735 Py_XDECREF(str);
8736 return NULL;
8737}
Tim Petersced69f82003-09-16 20:30:58 +00008738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008739static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008740fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008741{
8742 /* No need to call PyUnicode_READY(self) because this function is only
8743 called as a callback from fixup() which does it already. */
8744 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8745 const int kind = PyUnicode_KIND(self);
8746 void *data = PyUnicode_DATA(self);
8747 Py_UCS4 maxchar = 0, ch, fixed;
8748 Py_ssize_t i;
8749
8750 for (i = 0; i < len; ++i) {
8751 ch = PyUnicode_READ(kind, data, i);
8752 fixed = 0;
8753 if (ch > 127) {
8754 if (Py_UNICODE_ISSPACE(ch))
8755 fixed = ' ';
8756 else {
8757 const int decimal = Py_UNICODE_TODECIMAL(ch);
8758 if (decimal >= 0)
8759 fixed = '0' + decimal;
8760 }
8761 if (fixed != 0) {
8762 if (fixed > maxchar)
8763 maxchar = fixed;
8764 PyUnicode_WRITE(kind, data, i, fixed);
8765 }
8766 else if (ch > maxchar)
8767 maxchar = ch;
8768 }
8769 else if (ch > maxchar)
8770 maxchar = ch;
8771 }
8772
8773 return maxchar;
8774}
8775
8776PyObject *
8777_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8778{
8779 if (!PyUnicode_Check(unicode)) {
8780 PyErr_BadInternalCall();
8781 return NULL;
8782 }
8783 if (PyUnicode_READY(unicode) == -1)
8784 return NULL;
8785 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8786 /* If the string is already ASCII, just return the same string */
8787 Py_INCREF(unicode);
8788 return unicode;
8789 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008790 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008791}
8792
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008793PyObject *
8794PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8795 Py_ssize_t length)
8796{
Victor Stinnerf0124502011-11-21 23:12:56 +01008797 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008798 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008799 Py_UCS4 maxchar;
8800 enum PyUnicode_Kind kind;
8801 void *data;
8802
8803 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008804 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008805 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008806 if (ch > 127) {
8807 int decimal = Py_UNICODE_TODECIMAL(ch);
8808 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008809 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008810 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008811 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008812 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008813
8814 /* Copy to a new string */
8815 decimal = PyUnicode_New(length, maxchar);
8816 if (decimal == NULL)
8817 return decimal;
8818 kind = PyUnicode_KIND(decimal);
8819 data = PyUnicode_DATA(decimal);
8820 /* Iterate over code points */
8821 for (i = 0; i < length; i++) {
8822 Py_UNICODE ch = s[i];
8823 if (ch > 127) {
8824 int decimal = Py_UNICODE_TODECIMAL(ch);
8825 if (decimal >= 0)
8826 ch = '0' + decimal;
8827 }
8828 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008829 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008830 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008831}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008832/* --- Decimal Encoder ---------------------------------------------------- */
8833
Alexander Belopolsky40018472011-02-26 01:02:56 +00008834int
8835PyUnicode_EncodeDecimal(Py_UNICODE *s,
8836 Py_ssize_t length,
8837 char *output,
8838 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008839{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008840 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008841 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008842 enum PyUnicode_Kind kind;
8843 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008844
8845 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008846 PyErr_BadArgument();
8847 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008848 }
8849
Victor Stinner42bf7752011-11-21 22:52:58 +01008850 unicode = PyUnicode_FromUnicode(s, length);
8851 if (unicode == NULL)
8852 return -1;
8853
Victor Stinner6345be92011-11-25 20:09:01 +01008854 if (PyUnicode_READY(unicode) < 0) {
8855 Py_DECREF(unicode);
8856 return -1;
8857 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008858 kind = PyUnicode_KIND(unicode);
8859 data = PyUnicode_DATA(unicode);
8860
Victor Stinnerb84d7232011-11-22 01:50:07 +01008861 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008862 PyObject *exc;
8863 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008864 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008865 Py_ssize_t startpos;
8866
8867 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008868
Benjamin Peterson29060642009-01-31 22:14:21 +00008869 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008870 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008871 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008872 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008873 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008874 decimal = Py_UNICODE_TODECIMAL(ch);
8875 if (decimal >= 0) {
8876 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008877 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008878 continue;
8879 }
8880 if (0 < ch && ch < 256) {
8881 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008882 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008883 continue;
8884 }
Victor Stinner6345be92011-11-25 20:09:01 +01008885
Victor Stinner42bf7752011-11-21 22:52:58 +01008886 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008887 exc = NULL;
8888 raise_encode_exception(&exc, "decimal", unicode,
8889 startpos, startpos+1,
8890 "invalid decimal Unicode string");
8891 Py_XDECREF(exc);
8892 Py_DECREF(unicode);
8893 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008894 }
8895 /* 0-terminate the output string */
8896 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008897 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008898 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008899}
8900
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901/* --- Helpers ------------------------------------------------------------ */
8902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008904any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008905 Py_ssize_t start,
8906 Py_ssize_t end)
8907{
8908 int kind1, kind2, kind;
8909 void *buf1, *buf2;
8910 Py_ssize_t len1, len2, result;
8911
8912 kind1 = PyUnicode_KIND(s1);
8913 kind2 = PyUnicode_KIND(s2);
8914 kind = kind1 > kind2 ? kind1 : kind2;
8915 buf1 = PyUnicode_DATA(s1);
8916 buf2 = PyUnicode_DATA(s2);
8917 if (kind1 != kind)
8918 buf1 = _PyUnicode_AsKind(s1, kind);
8919 if (!buf1)
8920 return -2;
8921 if (kind2 != kind)
8922 buf2 = _PyUnicode_AsKind(s2, kind);
8923 if (!buf2) {
8924 if (kind1 != kind) PyMem_Free(buf1);
8925 return -2;
8926 }
8927 len1 = PyUnicode_GET_LENGTH(s1);
8928 len2 = PyUnicode_GET_LENGTH(s2);
8929
Victor Stinner794d5672011-10-10 03:21:36 +02008930 if (direction > 0) {
8931 switch(kind) {
8932 case PyUnicode_1BYTE_KIND:
8933 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8934 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8935 else
8936 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8937 break;
8938 case PyUnicode_2BYTE_KIND:
8939 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8940 break;
8941 case PyUnicode_4BYTE_KIND:
8942 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8943 break;
8944 default:
8945 assert(0); result = -2;
8946 }
8947 }
8948 else {
8949 switch(kind) {
8950 case PyUnicode_1BYTE_KIND:
8951 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8952 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8953 else
8954 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8955 break;
8956 case PyUnicode_2BYTE_KIND:
8957 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8958 break;
8959 case PyUnicode_4BYTE_KIND:
8960 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8961 break;
8962 default:
8963 assert(0); result = -2;
8964 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008965 }
8966
8967 if (kind1 != kind)
8968 PyMem_Free(buf1);
8969 if (kind2 != kind)
8970 PyMem_Free(buf2);
8971
8972 return result;
8973}
8974
8975Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008976_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977 Py_ssize_t n_buffer,
8978 void *digits, Py_ssize_t n_digits,
8979 Py_ssize_t min_width,
8980 const char *grouping,
8981 const char *thousands_sep)
8982{
8983 switch(kind) {
8984 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008985 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8986 return _PyUnicode_ascii_InsertThousandsGrouping(
8987 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8988 min_width, grouping, thousands_sep);
8989 else
8990 return _PyUnicode_ucs1_InsertThousandsGrouping(
8991 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8992 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008993 case PyUnicode_2BYTE_KIND:
8994 return _PyUnicode_ucs2_InsertThousandsGrouping(
8995 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8996 min_width, grouping, thousands_sep);
8997 case PyUnicode_4BYTE_KIND:
8998 return _PyUnicode_ucs4_InsertThousandsGrouping(
8999 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9000 min_width, grouping, thousands_sep);
9001 }
9002 assert(0);
9003 return -1;
9004}
9005
9006
/* Helper macro to fix up start/end slice values against a length:
     - end greater than len is clamped to len;
     - a negative end has len added once and is then clamped to 0;
     - a negative start has len added once and is then clamped to 0.
   start is not clamped to len.

   start and end must be modifiable lvalues; every argument is evaluated
   more than once, so none may have side effects.  Arguments are
   parenthesized so that an expression (e.g. a conditional) can be passed
   as len.  The expansion is a plain statement sequence rather than a
   do { } while (0) block so that existing call sites keep compiling
   unchanged; do not use it as the unbraced body of an if/else. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009021
Alexander Belopolsky40018472011-02-26 01:02:56 +00009022Py_ssize_t
9023PyUnicode_Count(PyObject *str,
9024 PyObject *substr,
9025 Py_ssize_t start,
9026 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009028 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009029 PyObject* str_obj;
9030 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 int kind1, kind2, kind;
9032 void *buf1 = NULL, *buf2 = NULL;
9033 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009034
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009035 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009036 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009037 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009038 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009039 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009040 Py_DECREF(str_obj);
9041 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042 }
Tim Petersced69f82003-09-16 20:30:58 +00009043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044 kind1 = PyUnicode_KIND(str_obj);
9045 kind2 = PyUnicode_KIND(sub_obj);
9046 kind = kind1 > kind2 ? kind1 : kind2;
9047 buf1 = PyUnicode_DATA(str_obj);
9048 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009049 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 if (!buf1)
9051 goto onError;
9052 buf2 = PyUnicode_DATA(sub_obj);
9053 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009054 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009055 if (!buf2)
9056 goto onError;
9057 len1 = PyUnicode_GET_LENGTH(str_obj);
9058 len2 = PyUnicode_GET_LENGTH(sub_obj);
9059
9060 ADJUST_INDICES(start, end, len1);
9061 switch(kind) {
9062 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009063 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9064 result = asciilib_count(
9065 ((Py_UCS1*)buf1) + start, end - start,
9066 buf2, len2, PY_SSIZE_T_MAX
9067 );
9068 else
9069 result = ucs1lib_count(
9070 ((Py_UCS1*)buf1) + start, end - start,
9071 buf2, len2, PY_SSIZE_T_MAX
9072 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009073 break;
9074 case PyUnicode_2BYTE_KIND:
9075 result = ucs2lib_count(
9076 ((Py_UCS2*)buf1) + start, end - start,
9077 buf2, len2, PY_SSIZE_T_MAX
9078 );
9079 break;
9080 case PyUnicode_4BYTE_KIND:
9081 result = ucs4lib_count(
9082 ((Py_UCS4*)buf1) + start, end - start,
9083 buf2, len2, PY_SSIZE_T_MAX
9084 );
9085 break;
9086 default:
9087 assert(0); result = 0;
9088 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009089
9090 Py_DECREF(sub_obj);
9091 Py_DECREF(str_obj);
9092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009093 if (kind1 != kind)
9094 PyMem_Free(buf1);
9095 if (kind2 != kind)
9096 PyMem_Free(buf2);
9097
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099 onError:
9100 Py_DECREF(sub_obj);
9101 Py_DECREF(str_obj);
9102 if (kind1 != kind && buf1)
9103 PyMem_Free(buf1);
9104 if (kind2 != kind && buf2)
9105 PyMem_Free(buf2);
9106 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009107}
9108
Alexander Belopolsky40018472011-02-26 01:02:56 +00009109Py_ssize_t
9110PyUnicode_Find(PyObject *str,
9111 PyObject *sub,
9112 Py_ssize_t start,
9113 Py_ssize_t end,
9114 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009115{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009116 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009117
Guido van Rossumd57fd912000-03-10 22:53:23 +00009118 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009119 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009120 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009121 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009122 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009123 Py_DECREF(str);
9124 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009125 }
Tim Petersced69f82003-09-16 20:30:58 +00009126
Victor Stinner794d5672011-10-10 03:21:36 +02009127 result = any_find_slice(direction,
9128 str, sub, start, end
9129 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009130
Guido van Rossumd57fd912000-03-10 22:53:23 +00009131 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009132 Py_DECREF(sub);
9133
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134 return result;
9135}
9136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009137Py_ssize_t
9138PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9139 Py_ssize_t start, Py_ssize_t end,
9140 int direction)
9141{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009142 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009143 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009144 if (PyUnicode_READY(str) == -1)
9145 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009146 if (start < 0 || end < 0) {
9147 PyErr_SetString(PyExc_IndexError, "string index out of range");
9148 return -2;
9149 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150 if (end > PyUnicode_GET_LENGTH(str))
9151 end = PyUnicode_GET_LENGTH(str);
9152 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009153 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9154 kind, end-start, ch, direction);
9155 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009156 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009157 else
9158 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009159}
9160
Alexander Belopolsky40018472011-02-26 01:02:56 +00009161static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009162tailmatch(PyObject *self,
9163 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009164 Py_ssize_t start,
9165 Py_ssize_t end,
9166 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009167{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009168 int kind_self;
9169 int kind_sub;
9170 void *data_self;
9171 void *data_sub;
9172 Py_ssize_t offset;
9173 Py_ssize_t i;
9174 Py_ssize_t end_sub;
9175
9176 if (PyUnicode_READY(self) == -1 ||
9177 PyUnicode_READY(substring) == -1)
9178 return 0;
9179
9180 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009181 return 1;
9182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009183 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9184 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009186 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009188 kind_self = PyUnicode_KIND(self);
9189 data_self = PyUnicode_DATA(self);
9190 kind_sub = PyUnicode_KIND(substring);
9191 data_sub = PyUnicode_DATA(substring);
9192 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9193
9194 if (direction > 0)
9195 offset = end;
9196 else
9197 offset = start;
9198
9199 if (PyUnicode_READ(kind_self, data_self, offset) ==
9200 PyUnicode_READ(kind_sub, data_sub, 0) &&
9201 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9202 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9203 /* If both are of the same kind, memcmp is sufficient */
9204 if (kind_self == kind_sub) {
9205 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009206 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009207 data_sub,
9208 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009209 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009210 }
9211 /* otherwise we have to compare each character by first accessing it */
9212 else {
9213 /* We do not need to compare 0 and len(substring)-1 because
9214 the if statement above ensured already that they are equal
9215 when we end up here. */
9216 // TODO: honor direction and do a forward or backwards search
9217 for (i = 1; i < end_sub; ++i) {
9218 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9219 PyUnicode_READ(kind_sub, data_sub, i))
9220 return 0;
9221 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009222 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009223 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009224 }
9225
9226 return 0;
9227}
9228
Alexander Belopolsky40018472011-02-26 01:02:56 +00009229Py_ssize_t
9230PyUnicode_Tailmatch(PyObject *str,
9231 PyObject *substr,
9232 Py_ssize_t start,
9233 Py_ssize_t end,
9234 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009235{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009236 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009237
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238 str = PyUnicode_FromObject(str);
9239 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009240 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009241 substr = PyUnicode_FromObject(substr);
9242 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009243 Py_DECREF(str);
9244 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009245 }
Tim Petersced69f82003-09-16 20:30:58 +00009246
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009247 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009248 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009249 Py_DECREF(str);
9250 Py_DECREF(substr);
9251 return result;
9252}
9253
Guido van Rossumd57fd912000-03-10 22:53:23 +00009254/* Apply fixfct filter to the Unicode object self and return a
9255 reference to the modified object */
9256
Alexander Belopolsky40018472011-02-26 01:02:56 +00009257static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009258fixup(PyObject *self,
9259 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009260{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261 PyObject *u;
9262 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009263
Victor Stinner87af4f22011-11-21 23:03:47 +01009264 u = PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009265 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009266 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009267 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009268
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009269 /* fix functions return the new maximum character in a string,
9270 if the kind of the resulting unicode object does not change,
9271 everything is fine. Otherwise we need to change the string kind
9272 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009273 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274 if (maxchar_new == 0)
9275 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9276 else if (maxchar_new <= 127)
9277 maxchar_new = 127;
9278 else if (maxchar_new <= 255)
9279 maxchar_new = 255;
9280 else if (maxchar_new <= 65535)
9281 maxchar_new = 65535;
9282 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009283 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009284
9285 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009286 /* fixfct should return TRUE if it modified the buffer. If
9287 FALSE, return a reference to the original buffer instead
9288 (to save space, not time) */
9289 Py_INCREF(self);
9290 Py_DECREF(u);
Victor Stinner7931d9a2011-11-04 00:22:48 +01009291 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009292 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293 else if (maxchar_new == maxchar_old) {
9294 return u;
9295 }
9296 else {
9297 /* In case the maximum character changed, we need to
9298 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009299 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009300 if (v == NULL) {
9301 Py_DECREF(u);
9302 return NULL;
9303 }
9304 if (maxchar_new > maxchar_old) {
9305 /* If the maxchar increased so that the kind changed, not all
9306 characters are representable anymore and we need to fix the
9307 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009308 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009309 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009310 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9311 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009312 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009313 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315
9316 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009317 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 return v;
9319 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009320}
9321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009322static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009323fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009324{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009325 /* No need to call PyUnicode_READY(self) because this function is only
9326 called as a callback from fixup() which does it already. */
9327 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9328 const int kind = PyUnicode_KIND(self);
9329 void *data = PyUnicode_DATA(self);
9330 int touched = 0;
9331 Py_UCS4 maxchar = 0;
9332 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009334 for (i = 0; i < len; ++i) {
9335 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9336 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9337 if (up != ch) {
9338 if (up > maxchar)
9339 maxchar = up;
9340 PyUnicode_WRITE(kind, data, i, up);
9341 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009342 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009343 else if (ch > maxchar)
9344 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009345 }
9346
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 if (touched)
9348 return maxchar;
9349 else
9350 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009351}
9352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009353static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009354fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009355{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009356 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9357 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9358 const int kind = PyUnicode_KIND(self);
9359 void *data = PyUnicode_DATA(self);
9360 int touched = 0;
9361 Py_UCS4 maxchar = 0;
9362 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009363
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009364 for(i = 0; i < len; ++i) {
9365 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9366 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9367 if (lo != ch) {
9368 if (lo > maxchar)
9369 maxchar = lo;
9370 PyUnicode_WRITE(kind, data, i, lo);
9371 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009372 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009373 else if (ch > maxchar)
9374 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375 }
9376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 if (touched)
9378 return maxchar;
9379 else
9380 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009381}
9382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009383static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009384fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009385{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009386 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9387 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9388 const int kind = PyUnicode_KIND(self);
9389 void *data = PyUnicode_DATA(self);
9390 int touched = 0;
9391 Py_UCS4 maxchar = 0;
9392 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009393
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394 for(i = 0; i < len; ++i) {
9395 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9396 Py_UCS4 nu = 0;
9397
9398 if (Py_UNICODE_ISUPPER(ch))
9399 nu = Py_UNICODE_TOLOWER(ch);
9400 else if (Py_UNICODE_ISLOWER(ch))
9401 nu = Py_UNICODE_TOUPPER(ch);
9402
9403 if (nu != 0) {
9404 if (nu > maxchar)
9405 maxchar = nu;
9406 PyUnicode_WRITE(kind, data, i, nu);
9407 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009408 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009409 else if (ch > maxchar)
9410 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009411 }
9412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009413 if (touched)
9414 return maxchar;
9415 else
9416 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009417}
9418
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009419static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009420fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009421{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009422 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9423 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9424 const int kind = PyUnicode_KIND(self);
9425 void *data = PyUnicode_DATA(self);
9426 int touched = 0;
9427 Py_UCS4 maxchar = 0;
9428 Py_ssize_t i = 0;
9429 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009430
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009431 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009432 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009433
9434 ch = PyUnicode_READ(kind, data, i);
9435 if (!Py_UNICODE_ISUPPER(ch)) {
9436 maxchar = Py_UNICODE_TOUPPER(ch);
9437 PyUnicode_WRITE(kind, data, i, maxchar);
9438 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009439 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009440 ++i;
9441 for(; i < len; ++i) {
9442 ch = PyUnicode_READ(kind, data, i);
9443 if (!Py_UNICODE_ISLOWER(ch)) {
9444 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9445 if (lo > maxchar)
9446 maxchar = lo;
9447 PyUnicode_WRITE(kind, data, i, lo);
9448 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009449 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 else if (ch > maxchar)
9451 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009452 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453
9454 if (touched)
9455 return maxchar;
9456 else
9457 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009458}
9459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009461fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009462{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9464 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9465 const int kind = PyUnicode_KIND(self);
9466 void *data = PyUnicode_DATA(self);
9467 Py_UCS4 maxchar = 0;
9468 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009469 int previous_is_cased;
9470
9471 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 if (len == 1) {
9473 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9474 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9475 if (ti != ch) {
9476 PyUnicode_WRITE(kind, data, i, ti);
9477 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009478 }
9479 else
9480 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483 for(; i < len; ++i) {
9484 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9485 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009486
Benjamin Peterson29060642009-01-31 22:14:21 +00009487 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009489 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 nu = Py_UNICODE_TOTITLE(ch);
9491
9492 if (nu > maxchar)
9493 maxchar = nu;
9494 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009495
Benjamin Peterson29060642009-01-31 22:14:21 +00009496 if (Py_UNICODE_ISLOWER(ch) ||
9497 Py_UNICODE_ISUPPER(ch) ||
9498 Py_UNICODE_ISTITLE(ch))
9499 previous_is_cased = 1;
9500 else
9501 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504}
9505
Tim Peters8ce9f162004-08-27 01:49:32 +00009506PyObject *
9507PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009510 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009512 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009513 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9514 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009515 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009517 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009519 int use_memcpy;
9520 unsigned char *res_data = NULL, *sep_data = NULL;
9521 PyObject *last_obj;
9522 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009523
Tim Peters05eba1f2004-08-27 21:32:02 +00009524 fseq = PySequence_Fast(seq, "");
9525 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009526 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009527 }
9528
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009529 /* NOTE: the following code can't call back into Python code,
9530 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009531 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009532
Tim Peters05eba1f2004-08-27 21:32:02 +00009533 seqlen = PySequence_Fast_GET_SIZE(fseq);
9534 /* If empty sequence, return u"". */
9535 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009536 Py_DECREF(fseq);
9537 Py_INCREF(unicode_empty);
9538 res = unicode_empty;
9539 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009540 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009541
Tim Peters05eba1f2004-08-27 21:32:02 +00009542 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009543 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009544 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009545 if (seqlen == 1) {
9546 if (PyUnicode_CheckExact(items[0])) {
9547 res = items[0];
9548 Py_INCREF(res);
9549 Py_DECREF(fseq);
9550 return res;
9551 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009552 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009553 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009554 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009555 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009556 /* Set up sep and seplen */
9557 if (separator == NULL) {
9558 /* fall back to a blank space separator */
9559 sep = PyUnicode_FromOrdinal(' ');
9560 if (!sep)
9561 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009562 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009563 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009564 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009565 else {
9566 if (!PyUnicode_Check(separator)) {
9567 PyErr_Format(PyExc_TypeError,
9568 "separator: expected str instance,"
9569 " %.80s found",
9570 Py_TYPE(separator)->tp_name);
9571 goto onError;
9572 }
9573 if (PyUnicode_READY(separator))
9574 goto onError;
9575 sep = separator;
9576 seplen = PyUnicode_GET_LENGTH(separator);
9577 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9578 /* inc refcount to keep this code path symmetric with the
9579 above case of a blank separator */
9580 Py_INCREF(sep);
9581 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009582 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009583 }
9584
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009585 /* There are at least two things to join, or else we have a subclass
9586 * of str in the sequence.
9587 * Do a pre-pass to figure out the total amount of space we'll
9588 * need (sz), and see whether all argument are strings.
9589 */
9590 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009591#ifdef Py_DEBUG
9592 use_memcpy = 0;
9593#else
9594 use_memcpy = 1;
9595#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009596 for (i = 0; i < seqlen; i++) {
9597 const Py_ssize_t old_sz = sz;
9598 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009599 if (!PyUnicode_Check(item)) {
9600 PyErr_Format(PyExc_TypeError,
9601 "sequence item %zd: expected str instance,"
9602 " %.80s found",
9603 i, Py_TYPE(item)->tp_name);
9604 goto onError;
9605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009606 if (PyUnicode_READY(item) == -1)
9607 goto onError;
9608 sz += PyUnicode_GET_LENGTH(item);
9609 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009610 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009611 if (i != 0)
9612 sz += seplen;
9613 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9614 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009615 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009616 goto onError;
9617 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009618 if (use_memcpy && last_obj != NULL) {
9619 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9620 use_memcpy = 0;
9621 }
9622 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009623 }
Tim Petersced69f82003-09-16 20:30:58 +00009624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009626 if (res == NULL)
9627 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009628
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009629 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009630#ifdef Py_DEBUG
9631 use_memcpy = 0;
9632#else
9633 if (use_memcpy) {
9634 res_data = PyUnicode_1BYTE_DATA(res);
9635 kind = PyUnicode_KIND(res);
9636 if (seplen != 0)
9637 sep_data = PyUnicode_1BYTE_DATA(sep);
9638 }
9639#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009640 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009641 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009642 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009643 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009644 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009645 if (use_memcpy) {
9646 Py_MEMCPY(res_data,
9647 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009648 kind * seplen);
9649 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009650 }
9651 else {
9652 copy_characters(res, res_offset, sep, 0, seplen);
9653 res_offset += seplen;
9654 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009655 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009656 itemlen = PyUnicode_GET_LENGTH(item);
9657 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009658 if (use_memcpy) {
9659 Py_MEMCPY(res_data,
9660 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009661 kind * itemlen);
9662 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009663 }
9664 else {
9665 copy_characters(res, res_offset, item, 0, itemlen);
9666 res_offset += itemlen;
9667 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009668 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009669 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009670 if (use_memcpy)
9671 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009672 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009673 else
9674 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009675
Tim Peters05eba1f2004-08-27 21:32:02 +00009676 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009678 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009679 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009680
Benjamin Peterson29060642009-01-31 22:14:21 +00009681 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009682 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009683 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009684 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009685 return NULL;
9686}
9687
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive characters of the character buffer `data`, starting at
   character index `start`.  `kind` selects the character width: 1-byte,
   2-byte, or (any other value) 4-byte; PyUnicode_WCHAR_KIND is rejected by
   an assertion.

   Every parameter is parenthesized in the expansion so that expression
   arguments (e.g. `a ? b : c`) keep their meaning.  Arguments may be
   evaluated more than once, so they must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
9710
Victor Stinner9310abb2011-10-05 00:59:23 +02009711static PyObject *
9712pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009713 Py_ssize_t left,
9714 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009715 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009717 PyObject *u;
9718 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009719 int kind;
9720 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009721
9722 if (left < 0)
9723 left = 0;
9724 if (right < 0)
9725 right = 0;
9726
Tim Peters7a29bd52001-09-12 03:03:31 +00009727 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009728 Py_INCREF(self);
9729 return self;
9730 }
9731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009732 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9733 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009734 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9735 return NULL;
9736 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009737 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9738 if (fill > maxchar)
9739 maxchar = fill;
9740 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009741 if (!u)
9742 return NULL;
9743
9744 kind = PyUnicode_KIND(u);
9745 data = PyUnicode_DATA(u);
9746 if (left)
9747 FILL(kind, data, fill, 0, left);
9748 if (right)
9749 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009750 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009751 assert(_PyUnicode_CheckConsistency(u, 1));
9752 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009753}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009754#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009755
Alexander Belopolsky40018472011-02-26 01:02:56 +00009756PyObject *
9757PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009758{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009759 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009760
9761 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009762 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009763 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009765 switch(PyUnicode_KIND(string)) {
9766 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009767 if (PyUnicode_IS_ASCII(string))
9768 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009769 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009770 PyUnicode_GET_LENGTH(string), keepends);
9771 else
9772 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009773 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009774 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009775 break;
9776 case PyUnicode_2BYTE_KIND:
9777 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009778 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009779 PyUnicode_GET_LENGTH(string), keepends);
9780 break;
9781 case PyUnicode_4BYTE_KIND:
9782 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009783 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 PyUnicode_GET_LENGTH(string), keepends);
9785 break;
9786 default:
9787 assert(0);
9788 list = 0;
9789 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009790 Py_DECREF(string);
9791 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009792}
9793
Alexander Belopolsky40018472011-02-26 01:02:56 +00009794static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009795split(PyObject *self,
9796 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009797 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009798{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009799 int kind1, kind2, kind;
9800 void *buf1, *buf2;
9801 Py_ssize_t len1, len2;
9802 PyObject* out;
9803
Guido van Rossumd57fd912000-03-10 22:53:23 +00009804 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009805 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009807 if (PyUnicode_READY(self) == -1)
9808 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009810 if (substring == NULL)
9811 switch(PyUnicode_KIND(self)) {
9812 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009813 if (PyUnicode_IS_ASCII(self))
9814 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009815 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009816 PyUnicode_GET_LENGTH(self), maxcount
9817 );
9818 else
9819 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009820 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009821 PyUnicode_GET_LENGTH(self), maxcount
9822 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009823 case PyUnicode_2BYTE_KIND:
9824 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009825 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009826 PyUnicode_GET_LENGTH(self), maxcount
9827 );
9828 case PyUnicode_4BYTE_KIND:
9829 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009830 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831 PyUnicode_GET_LENGTH(self), maxcount
9832 );
9833 default:
9834 assert(0);
9835 return NULL;
9836 }
9837
9838 if (PyUnicode_READY(substring) == -1)
9839 return NULL;
9840
9841 kind1 = PyUnicode_KIND(self);
9842 kind2 = PyUnicode_KIND(substring);
9843 kind = kind1 > kind2 ? kind1 : kind2;
9844 buf1 = PyUnicode_DATA(self);
9845 buf2 = PyUnicode_DATA(substring);
9846 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009847 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009848 if (!buf1)
9849 return NULL;
9850 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009851 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009852 if (!buf2) {
9853 if (kind1 != kind) PyMem_Free(buf1);
9854 return NULL;
9855 }
9856 len1 = PyUnicode_GET_LENGTH(self);
9857 len2 = PyUnicode_GET_LENGTH(substring);
9858
9859 switch(kind) {
9860 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009861 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9862 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009863 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009864 else
9865 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009866 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 break;
9868 case PyUnicode_2BYTE_KIND:
9869 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009870 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009871 break;
9872 case PyUnicode_4BYTE_KIND:
9873 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009874 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 break;
9876 default:
9877 out = NULL;
9878 }
9879 if (kind1 != kind)
9880 PyMem_Free(buf1);
9881 if (kind2 != kind)
9882 PyMem_Free(buf2);
9883 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009884}
9885
Alexander Belopolsky40018472011-02-26 01:02:56 +00009886static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009887rsplit(PyObject *self,
9888 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009889 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009890{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891 int kind1, kind2, kind;
9892 void *buf1, *buf2;
9893 Py_ssize_t len1, len2;
9894 PyObject* out;
9895
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009896 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009897 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009899 if (PyUnicode_READY(self) == -1)
9900 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009902 if (substring == NULL)
9903 switch(PyUnicode_KIND(self)) {
9904 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009905 if (PyUnicode_IS_ASCII(self))
9906 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009907 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009908 PyUnicode_GET_LENGTH(self), maxcount
9909 );
9910 else
9911 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009912 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009913 PyUnicode_GET_LENGTH(self), maxcount
9914 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009915 case PyUnicode_2BYTE_KIND:
9916 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009917 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009918 PyUnicode_GET_LENGTH(self), maxcount
9919 );
9920 case PyUnicode_4BYTE_KIND:
9921 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009922 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009923 PyUnicode_GET_LENGTH(self), maxcount
9924 );
9925 default:
9926 assert(0);
9927 return NULL;
9928 }
9929
9930 if (PyUnicode_READY(substring) == -1)
9931 return NULL;
9932
9933 kind1 = PyUnicode_KIND(self);
9934 kind2 = PyUnicode_KIND(substring);
9935 kind = kind1 > kind2 ? kind1 : kind2;
9936 buf1 = PyUnicode_DATA(self);
9937 buf2 = PyUnicode_DATA(substring);
9938 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009939 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009940 if (!buf1)
9941 return NULL;
9942 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009943 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 if (!buf2) {
9945 if (kind1 != kind) PyMem_Free(buf1);
9946 return NULL;
9947 }
9948 len1 = PyUnicode_GET_LENGTH(self);
9949 len2 = PyUnicode_GET_LENGTH(substring);
9950
9951 switch(kind) {
9952 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009953 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9954 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009955 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009956 else
9957 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009958 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 break;
9960 case PyUnicode_2BYTE_KIND:
9961 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009962 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009963 break;
9964 case PyUnicode_4BYTE_KIND:
9965 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009966 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009967 break;
9968 default:
9969 out = NULL;
9970 }
9971 if (kind1 != kind)
9972 PyMem_Free(buf1);
9973 if (kind2 != kind)
9974 PyMem_Free(buf2);
9975 return out;
9976}
9977
9978static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009979anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9980 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009981{
9982 switch(kind) {
9983 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009984 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9985 return asciilib_find(buf1, len1, buf2, len2, offset);
9986 else
9987 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 case PyUnicode_2BYTE_KIND:
9989 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9990 case PyUnicode_4BYTE_KIND:
9991 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9992 }
9993 assert(0);
9994 return -1;
9995}
9996
9997static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009998anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9999 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000{
10001 switch(kind) {
10002 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010003 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10004 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10005 else
10006 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010007 case PyUnicode_2BYTE_KIND:
10008 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10009 case PyUnicode_4BYTE_KIND:
10010 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10011 }
10012 assert(0);
10013 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010014}
10015
Alexander Belopolsky40018472011-02-26 01:02:56 +000010016static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017replace(PyObject *self, PyObject *str1,
10018 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020 PyObject *u;
10021 char *sbuf = PyUnicode_DATA(self);
10022 char *buf1 = PyUnicode_DATA(str1);
10023 char *buf2 = PyUnicode_DATA(str2);
10024 int srelease = 0, release1 = 0, release2 = 0;
10025 int skind = PyUnicode_KIND(self);
10026 int kind1 = PyUnicode_KIND(str1);
10027 int kind2 = PyUnicode_KIND(str2);
10028 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10029 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10030 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010031 int mayshrink;
10032 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033
10034 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010035 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010037 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010038
Victor Stinner59de0ee2011-10-07 10:01:28 +020010039 if (str1 == str2)
10040 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041 if (skind < kind1)
10042 /* substring too wide to be present */
10043 goto nothing;
10044
Victor Stinner49a0a212011-10-12 23:46:10 +020010045 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10046 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10047 /* Replacing str1 with str2 may cause a maxchar reduction in the
10048 result string. */
10049 mayshrink = (maxchar_str2 < maxchar);
10050 maxchar = Py_MAX(maxchar, maxchar_str2);
10051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010053 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010054 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010055 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010056 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010058 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010059 Py_UCS4 u1, u2;
10060 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010062 if (findchar(sbuf, PyUnicode_KIND(self),
10063 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010064 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010066 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010067 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010068 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010069 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 rkind = PyUnicode_KIND(u);
10071 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10072 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010073 if (--maxcount < 0)
10074 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010076 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010077 }
10078 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010079 int rkind = skind;
10080 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010082 if (kind1 < rkind) {
10083 /* widen substring */
10084 buf1 = _PyUnicode_AsKind(str1, rkind);
10085 if (!buf1) goto error;
10086 release1 = 1;
10087 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010088 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010089 if (i < 0)
10090 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 if (rkind > kind2) {
10092 /* widen replacement */
10093 buf2 = _PyUnicode_AsKind(str2, rkind);
10094 if (!buf2) goto error;
10095 release2 = 1;
10096 }
10097 else if (rkind < kind2) {
10098 /* widen self and buf1 */
10099 rkind = kind2;
10100 if (release1) PyMem_Free(buf1);
10101 sbuf = _PyUnicode_AsKind(self, rkind);
10102 if (!sbuf) goto error;
10103 srelease = 1;
10104 buf1 = _PyUnicode_AsKind(str1, rkind);
10105 if (!buf1) goto error;
10106 release1 = 1;
10107 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010108 u = PyUnicode_New(slen, maxchar);
10109 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010111 assert(PyUnicode_KIND(u) == rkind);
10112 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010113
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010114 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010115 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010116 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010118 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010120
10121 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010122 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010123 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010124 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010125 if (i == -1)
10126 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010127 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010129 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010131 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010132 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010133 }
10134 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 Py_ssize_t n, i, j, ires;
10136 Py_ssize_t product, new_size;
10137 int rkind = skind;
10138 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010140 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010141 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 buf1 = _PyUnicode_AsKind(str1, rkind);
10143 if (!buf1) goto error;
10144 release1 = 1;
10145 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010146 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010147 if (n == 0)
10148 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010149 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010150 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 buf2 = _PyUnicode_AsKind(str2, rkind);
10152 if (!buf2) goto error;
10153 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010154 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010156 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 rkind = kind2;
10158 sbuf = _PyUnicode_AsKind(self, rkind);
10159 if (!sbuf) goto error;
10160 srelease = 1;
10161 if (release1) PyMem_Free(buf1);
10162 buf1 = _PyUnicode_AsKind(str1, rkind);
10163 if (!buf1) goto error;
10164 release1 = 1;
10165 }
10166 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10167 PyUnicode_GET_LENGTH(str1))); */
10168 product = n * (len2-len1);
10169 if ((product / (len2-len1)) != n) {
10170 PyErr_SetString(PyExc_OverflowError,
10171 "replace string is too long");
10172 goto error;
10173 }
10174 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010175 if (new_size == 0) {
10176 Py_INCREF(unicode_empty);
10177 u = unicode_empty;
10178 goto done;
10179 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10181 PyErr_SetString(PyExc_OverflowError,
10182 "replace string is too long");
10183 goto error;
10184 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010185 u = PyUnicode_New(new_size, maxchar);
10186 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010188 assert(PyUnicode_KIND(u) == rkind);
10189 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 ires = i = 0;
10191 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010192 while (n-- > 0) {
10193 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010194 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010195 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010196 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010197 if (j == -1)
10198 break;
10199 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010200 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010201 memcpy(res + rkind * ires,
10202 sbuf + rkind * i,
10203 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010204 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010205 }
10206 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010208 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010210 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010211 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010212 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010214 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010216 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010217 memcpy(res + rkind * ires,
10218 sbuf + rkind * i,
10219 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010220 }
10221 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010222 /* interleave */
10223 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010224 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010226 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010228 if (--n <= 0)
10229 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010230 memcpy(res + rkind * ires,
10231 sbuf + rkind * i,
10232 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 ires++;
10234 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010235 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010236 memcpy(res + rkind * ires,
10237 sbuf + rkind * i,
10238 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010239 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010240 }
10241
10242 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010243 unicode_adjust_maxchar(&u);
10244 if (u == NULL)
10245 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010247
10248 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 if (srelease)
10250 PyMem_FREE(sbuf);
10251 if (release1)
10252 PyMem_FREE(buf1);
10253 if (release2)
10254 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010255 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010257
Benjamin Peterson29060642009-01-31 22:14:21 +000010258 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010259 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010260 if (srelease)
10261 PyMem_FREE(sbuf);
10262 if (release1)
10263 PyMem_FREE(buf1);
10264 if (release2)
10265 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010266 if (PyUnicode_CheckExact(self)) {
10267 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010268 return self;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010269 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010270 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010271 error:
10272 if (srelease && sbuf)
10273 PyMem_FREE(sbuf);
10274 if (release1 && buf1)
10275 PyMem_FREE(buf1);
10276 if (release2 && buf2)
10277 PyMem_FREE(buf2);
10278 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010279}
10280
10281/* --- Unicode Object Methods --------------------------------------------- */
10282
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010283PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010284 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010285\n\
10286Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010287characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010288
10289static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010290unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010291{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010292 return fixup(self, fixtitle);
10293}
10294
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010295PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010296 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010297\n\
10298Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010299have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010300
10301static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010302unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010303{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010304 return fixup(self, fixcapitalize);
10305}
10306
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out.  Splits self with split(self, NULL, -1), replaces every
   list item by fixup(item, fixcapitalize), then joins the list with
   PyUnicode_Join(NULL, ...).  Returns NULL as soon as any step fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *previous = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup(previous, fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(previous);
        PyList_SET_ITEM(words, idx, capitalized);
        idx++;
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10344
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010345/* Argument converter. Coerces to a single unicode character */
10346
10347static int
10348convert_uc(PyObject *obj, void *addr)
10349{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010351 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010352
Benjamin Peterson14339b62009-01-31 16:36:08 +000010353 uniobj = PyUnicode_FromObject(obj);
10354 if (uniobj == NULL) {
10355 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010356 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010357 return 0;
10358 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010360 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010361 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010362 Py_DECREF(uniobj);
10363 return 0;
10364 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010366 Py_DECREF(uniobj);
10367 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010368}
10369
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010370PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010371 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010372\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010373Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010374done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375
10376static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010377unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010378{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010379 Py_ssize_t marg, left;
10380 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 Py_UCS4 fillchar = ' ';
10382
Victor Stinnere9a29352011-10-01 02:14:59 +020010383 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010385
Victor Stinnere9a29352011-10-01 02:14:59 +020010386 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010387 return NULL;
10388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010390 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010391 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392 }
10393
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010395 left = marg / 2 + (marg & width & 1);
10396
Victor Stinner9310abb2011-10-05 00:59:23 +020010397 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010398}
10399
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400/* This function assumes that str1 and str2 are readied by the caller. */
10401
Marc-André Lemburge5034372000-08-08 08:04:29 +000010402static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010403unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010404{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 int kind1, kind2;
10406 void *data1, *data2;
10407 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010408
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 kind1 = PyUnicode_KIND(str1);
10410 kind2 = PyUnicode_KIND(str2);
10411 data1 = PyUnicode_DATA(str1);
10412 data2 = PyUnicode_DATA(str2);
10413 len1 = PyUnicode_GET_LENGTH(str1);
10414 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 for (i = 0; i < len1 && i < len2; ++i) {
10417 Py_UCS4 c1, c2;
10418 c1 = PyUnicode_READ(kind1, data1, i);
10419 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010420
10421 if (c1 != c2)
10422 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010423 }
10424
10425 return (len1 < len2) ? -1 : (len1 != len2);
10426}
10427
Alexander Belopolsky40018472011-02-26 01:02:56 +000010428int
10429PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10432 if (PyUnicode_READY(left) == -1 ||
10433 PyUnicode_READY(right) == -1)
10434 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010435 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010437 PyErr_Format(PyExc_TypeError,
10438 "Can't compare %.100s and %.100s",
10439 left->ob_type->tp_name,
10440 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010441 return -1;
10442}
10443
Martin v. Löwis5b222132007-06-10 09:51:05 +000010444int
10445PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10446{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 Py_ssize_t i;
10448 int kind;
10449 void *data;
10450 Py_UCS4 chr;
10451
Victor Stinner910337b2011-10-03 03:20:16 +020010452 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 if (PyUnicode_READY(uni) == -1)
10454 return -1;
10455 kind = PyUnicode_KIND(uni);
10456 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010457 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10459 if (chr != str[i])
10460 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010461 /* This check keeps Python strings that end in '\0' from comparing equal
10462 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010464 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010465 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010466 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010467 return 0;
10468}
10469
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010470
Benjamin Peterson29060642009-01-31 22:14:21 +000010471#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010472 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010473
Alexander Belopolsky40018472011-02-26 01:02:56 +000010474PyObject *
10475PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010476{
10477 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010478
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010479 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10480 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481 if (PyUnicode_READY(left) == -1 ||
10482 PyUnicode_READY(right) == -1)
10483 return NULL;
10484 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10485 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010486 if (op == Py_EQ) {
10487 Py_INCREF(Py_False);
10488 return Py_False;
10489 }
10490 if (op == Py_NE) {
10491 Py_INCREF(Py_True);
10492 return Py_True;
10493 }
10494 }
10495 if (left == right)
10496 result = 0;
10497 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010498 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010499
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010500 /* Convert the return value to a Boolean */
10501 switch (op) {
10502 case Py_EQ:
10503 v = TEST_COND(result == 0);
10504 break;
10505 case Py_NE:
10506 v = TEST_COND(result != 0);
10507 break;
10508 case Py_LE:
10509 v = TEST_COND(result <= 0);
10510 break;
10511 case Py_GE:
10512 v = TEST_COND(result >= 0);
10513 break;
10514 case Py_LT:
10515 v = TEST_COND(result == -1);
10516 break;
10517 case Py_GT:
10518 v = TEST_COND(result == 1);
10519 break;
10520 default:
10521 PyErr_BadArgument();
10522 return NULL;
10523 }
10524 Py_INCREF(v);
10525 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010526 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010527
Brian Curtindfc80e32011-08-10 20:28:54 -050010528 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010529}
10530
Alexander Belopolsky40018472011-02-26 01:02:56 +000010531int
10532PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010533{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010534 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 int kind1, kind2, kind;
10536 void *buf1, *buf2;
10537 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010538 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010539
10540 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010541 sub = PyUnicode_FromObject(element);
10542 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010543 PyErr_Format(PyExc_TypeError,
10544 "'in <string>' requires string as left operand, not %s",
10545 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010546 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010547 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 if (PyUnicode_READY(sub) == -1)
10549 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010550
Thomas Wouters477c8d52006-05-27 19:21:47 +000010551 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010552 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010553 Py_DECREF(sub);
10554 return -1;
10555 }
10556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 kind1 = PyUnicode_KIND(str);
10558 kind2 = PyUnicode_KIND(sub);
10559 kind = kind1 > kind2 ? kind1 : kind2;
10560 buf1 = PyUnicode_DATA(str);
10561 buf2 = PyUnicode_DATA(sub);
10562 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010563 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 if (!buf1) {
10565 Py_DECREF(sub);
10566 return -1;
10567 }
10568 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010569 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 if (!buf2) {
10571 Py_DECREF(sub);
10572 if (kind1 != kind) PyMem_Free(buf1);
10573 return -1;
10574 }
10575 len1 = PyUnicode_GET_LENGTH(str);
10576 len2 = PyUnicode_GET_LENGTH(sub);
10577
10578 switch(kind) {
10579 case PyUnicode_1BYTE_KIND:
10580 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10581 break;
10582 case PyUnicode_2BYTE_KIND:
10583 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10584 break;
10585 case PyUnicode_4BYTE_KIND:
10586 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10587 break;
10588 default:
10589 result = -1;
10590 assert(0);
10591 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010592
10593 Py_DECREF(str);
10594 Py_DECREF(sub);
10595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 if (kind1 != kind)
10597 PyMem_Free(buf1);
10598 if (kind2 != kind)
10599 PyMem_Free(buf2);
10600
Guido van Rossum403d68b2000-03-13 15:55:09 +000010601 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010602}
10603
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604/* Concat to string or Unicode object giving a new Unicode object. */
10605
Alexander Belopolsky40018472011-02-26 01:02:56 +000010606PyObject *
10607PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010610 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611
10612 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010615 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010618 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010619
10620 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010621 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010622 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010624 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010625 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010626 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628 }
10629
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010631 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10632 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633
Guido van Rossumd57fd912000-03-10 22:53:23 +000010634 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 w = PyUnicode_New(
10636 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10637 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010638 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010639 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010640 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10641 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642 Py_DECREF(u);
10643 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010644 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010646
Benjamin Peterson29060642009-01-31 22:14:21 +000010647 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010648 Py_XDECREF(u);
10649 Py_XDECREF(v);
10650 return NULL;
10651}
10652
Victor Stinnerb0923652011-10-04 01:17:31 +020010653static void
10654unicode_append_inplace(PyObject **p_left, PyObject *right)
10655{
10656 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010657
10658 assert(PyUnicode_IS_READY(*p_left));
10659 assert(PyUnicode_IS_READY(right));
10660
10661 left_len = PyUnicode_GET_LENGTH(*p_left);
10662 right_len = PyUnicode_GET_LENGTH(right);
10663 if (left_len > PY_SSIZE_T_MAX - right_len) {
10664 PyErr_SetString(PyExc_OverflowError,
10665 "strings are too large to concat");
10666 goto error;
10667 }
10668 new_len = left_len + right_len;
10669
10670 /* Now we own the last reference to 'left', so we can resize it
10671 * in-place.
10672 */
10673 if (unicode_resize(p_left, new_len) != 0) {
10674 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10675 * deallocated so it cannot be put back into
10676 * 'variable'. The MemoryError is raised when there
10677 * is no value in 'variable', which might (very
10678 * remotely) be a cause of incompatibilities.
10679 */
10680 goto error;
10681 }
10682 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010683 copy_characters(*p_left, left_len, right, 0, right_len);
10684 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010685 return;
10686
10687error:
10688 Py_DECREF(*p_left);
10689 *p_left = NULL;
10690}
10691
Walter Dörwald1ab83302007-05-18 17:15:44 +000010692void
Victor Stinner23e56682011-10-03 03:54:37 +020010693PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010694{
Victor Stinner23e56682011-10-03 03:54:37 +020010695 PyObject *left, *res;
10696
10697 if (p_left == NULL) {
10698 if (!PyErr_Occurred())
10699 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010700 return;
10701 }
Victor Stinner23e56682011-10-03 03:54:37 +020010702 left = *p_left;
10703 if (right == NULL || !PyUnicode_Check(left)) {
10704 if (!PyErr_Occurred())
10705 PyErr_BadInternalCall();
10706 goto error;
10707 }
10708
Victor Stinnere1335c72011-10-04 20:53:03 +020010709 if (PyUnicode_READY(left))
10710 goto error;
10711 if (PyUnicode_READY(right))
10712 goto error;
10713
Victor Stinner23e56682011-10-03 03:54:37 +020010714 if (PyUnicode_CheckExact(left) && left != unicode_empty
10715 && PyUnicode_CheckExact(right) && right != unicode_empty
10716 && unicode_resizable(left)
10717 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10718 || _PyUnicode_WSTR(left) != NULL))
10719 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010720 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10721 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010722 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010723 not so different than duplicating the string. */
10724 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010725 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010726 unicode_append_inplace(p_left, right);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010727 assert(p_left == NULL || _PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010728 return;
10729 }
10730 }
10731
10732 res = PyUnicode_Concat(left, right);
10733 if (res == NULL)
10734 goto error;
10735 Py_DECREF(left);
10736 *p_left = res;
10737 return;
10738
10739error:
10740 Py_DECREF(*p_left);
10741 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010742}
10743
10744void
10745PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10746{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010747 PyUnicode_Append(pleft, right);
10748 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010749}
10750
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010751PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010752 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010753\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010754Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010755string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010756interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757
10758static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010759unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010761 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010762 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010763 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010765 int kind1, kind2, kind;
10766 void *buf1, *buf2;
10767 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768
Jesus Ceaac451502011-04-20 17:09:23 +020010769 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10770 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010771 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010773 kind1 = PyUnicode_KIND(self);
10774 kind2 = PyUnicode_KIND(substring);
10775 kind = kind1 > kind2 ? kind1 : kind2;
10776 buf1 = PyUnicode_DATA(self);
10777 buf2 = PyUnicode_DATA(substring);
10778 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010779 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010780 if (!buf1) {
10781 Py_DECREF(substring);
10782 return NULL;
10783 }
10784 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010785 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786 if (!buf2) {
10787 Py_DECREF(substring);
10788 if (kind1 != kind) PyMem_Free(buf1);
10789 return NULL;
10790 }
10791 len1 = PyUnicode_GET_LENGTH(self);
10792 len2 = PyUnicode_GET_LENGTH(substring);
10793
10794 ADJUST_INDICES(start, end, len1);
10795 switch(kind) {
10796 case PyUnicode_1BYTE_KIND:
10797 iresult = ucs1lib_count(
10798 ((Py_UCS1*)buf1) + start, end - start,
10799 buf2, len2, PY_SSIZE_T_MAX
10800 );
10801 break;
10802 case PyUnicode_2BYTE_KIND:
10803 iresult = ucs2lib_count(
10804 ((Py_UCS2*)buf1) + start, end - start,
10805 buf2, len2, PY_SSIZE_T_MAX
10806 );
10807 break;
10808 case PyUnicode_4BYTE_KIND:
10809 iresult = ucs4lib_count(
10810 ((Py_UCS4*)buf1) + start, end - start,
10811 buf2, len2, PY_SSIZE_T_MAX
10812 );
10813 break;
10814 default:
10815 assert(0); iresult = 0;
10816 }
10817
10818 result = PyLong_FromSsize_t(iresult);
10819
10820 if (kind1 != kind)
10821 PyMem_Free(buf1);
10822 if (kind2 != kind)
10823 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824
10825 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010826
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827 return result;
10828}
10829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010830PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010831 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010833Encode S using the codec registered for encoding. Default encoding\n\
10834is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010835handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010836a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10837'xmlcharrefreplace' as well as any other name registered with\n\
10838codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839
10840static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010841unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010843 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844 char *encoding = NULL;
10845 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010846
Benjamin Peterson308d6372009-09-18 21:42:35 +000010847 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10848 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010850 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010851}
10852
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010853PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010854 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855\n\
10856Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010857If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010858
10859static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010860unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010862 Py_ssize_t i, j, line_pos, src_len, incr;
10863 Py_UCS4 ch;
10864 PyObject *u;
10865 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010866 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010867 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010868 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010869
10870 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010871 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010872
Antoine Pitrou22425222011-10-04 19:10:51 +020010873 if (PyUnicode_READY(self) == -1)
10874 return NULL;
10875
Thomas Wouters7e474022000-07-16 12:04:32 +000010876 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010877 src_len = PyUnicode_GET_LENGTH(self);
10878 i = j = line_pos = 0;
10879 kind = PyUnicode_KIND(self);
10880 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010881 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010882 for (; i < src_len; i++) {
10883 ch = PyUnicode_READ(kind, src_data, i);
10884 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010885 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010886 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010887 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010888 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010889 goto overflow;
10890 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010891 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010892 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010893 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010894 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010895 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010896 goto overflow;
10897 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010898 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010899 if (ch == '\n' || ch == '\r')
10900 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010902 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010903 if (!found && PyUnicode_CheckExact(self)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +010010904 Py_INCREF(self);
10905 return self;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010906 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010907
Guido van Rossumd57fd912000-03-10 22:53:23 +000010908 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010909 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010910 if (!u)
10911 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010912 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010913
Antoine Pitroue71d5742011-10-04 15:55:09 +020010914 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915
Antoine Pitroue71d5742011-10-04 15:55:09 +020010916 for (; i < src_len; i++) {
10917 ch = PyUnicode_READ(kind, src_data, i);
10918 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010919 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010920 incr = tabsize - (line_pos % tabsize);
10921 line_pos += incr;
10922 while (incr--) {
10923 PyUnicode_WRITE(kind, dest_data, j, ' ');
10924 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010925 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010926 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010927 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010928 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010929 line_pos++;
10930 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010931 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010932 if (ch == '\n' || ch == '\r')
10933 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010934 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010935 }
10936 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010937 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010938
Antoine Pitroue71d5742011-10-04 15:55:09 +020010939 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010940 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10941 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942}
10943
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010944PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010945 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946\n\
10947Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010948such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949arguments start and end are interpreted as in slice notation.\n\
10950\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010951Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952
10953static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010954unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010956 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010957 Py_ssize_t start;
10958 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010959 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960
Jesus Ceaac451502011-04-20 17:09:23 +020010961 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10962 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010965 if (PyUnicode_READY(self) == -1)
10966 return NULL;
10967 if (PyUnicode_READY(substring) == -1)
10968 return NULL;
10969
Victor Stinner7931d9a2011-11-04 00:22:48 +010010970 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010971
10972 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010974 if (result == -2)
10975 return NULL;
10976
Christian Heimes217cfd12007-12-02 14:31:20 +000010977 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978}
10979
10980static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010981unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010982{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010983 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10984 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010985 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010986 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987}
10988
Guido van Rossumc2504932007-09-18 19:42:40 +000010989/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010990 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010991static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010992unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993{
Guido van Rossumc2504932007-09-18 19:42:40 +000010994 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010995 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010997 if (_PyUnicode_HASH(self) != -1)
10998 return _PyUnicode_HASH(self);
10999 if (PyUnicode_READY(self) == -1)
11000 return -1;
11001 len = PyUnicode_GET_LENGTH(self);
11002
11003 /* The hash function as a macro, gets expanded three times below. */
11004#define HASH(P) \
11005 x = (Py_uhash_t)*P << 7; \
11006 while (--len >= 0) \
11007 x = (1000003*x) ^ (Py_uhash_t)*P++;
11008
11009 switch (PyUnicode_KIND(self)) {
11010 case PyUnicode_1BYTE_KIND: {
11011 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11012 HASH(c);
11013 break;
11014 }
11015 case PyUnicode_2BYTE_KIND: {
11016 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11017 HASH(s);
11018 break;
11019 }
11020 default: {
11021 Py_UCS4 *l;
11022 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11023 "Impossible switch case in unicode_hash");
11024 l = PyUnicode_4BYTE_DATA(self);
11025 HASH(l);
11026 break;
11027 }
11028 }
11029 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11030
Guido van Rossumc2504932007-09-18 19:42:40 +000011031 if (x == -1)
11032 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011034 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011035}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011036#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011038PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011039 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011041Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011042
11043static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011044unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011046 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011047 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011048 Py_ssize_t start;
11049 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050
Jesus Ceaac451502011-04-20 17:09:23 +020011051 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11052 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011054
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011055 if (PyUnicode_READY(self) == -1)
11056 return NULL;
11057 if (PyUnicode_READY(substring) == -1)
11058 return NULL;
11059
Victor Stinner7931d9a2011-11-04 00:22:48 +010011060 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011061
11062 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011064 if (result == -2)
11065 return NULL;
11066
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067 if (result < 0) {
11068 PyErr_SetString(PyExc_ValueError, "substring not found");
11069 return NULL;
11070 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011071
Christian Heimes217cfd12007-12-02 14:31:20 +000011072 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073}
11074
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011075PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011076 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011078Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011079at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080
11081static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011082unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011083{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011084 Py_ssize_t i, length;
11085 int kind;
11086 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087 int cased;
11088
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011089 if (PyUnicode_READY(self) == -1)
11090 return NULL;
11091 length = PyUnicode_GET_LENGTH(self);
11092 kind = PyUnicode_KIND(self);
11093 data = PyUnicode_DATA(self);
11094
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011096 if (length == 1)
11097 return PyBool_FromLong(
11098 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011099
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011100 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011101 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011102 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011103
Guido van Rossumd57fd912000-03-10 22:53:23 +000011104 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011105 for (i = 0; i < length; i++) {
11106 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011107
Benjamin Peterson29060642009-01-31 22:14:21 +000011108 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11109 return PyBool_FromLong(0);
11110 else if (!cased && Py_UNICODE_ISLOWER(ch))
11111 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011112 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011113 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011114}
11115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011116PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011117 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011119Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011120at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121
11122static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011123unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011124{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011125 Py_ssize_t i, length;
11126 int kind;
11127 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011128 int cased;
11129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011130 if (PyUnicode_READY(self) == -1)
11131 return NULL;
11132 length = PyUnicode_GET_LENGTH(self);
11133 kind = PyUnicode_KIND(self);
11134 data = PyUnicode_DATA(self);
11135
Guido van Rossumd57fd912000-03-10 22:53:23 +000011136 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011137 if (length == 1)
11138 return PyBool_FromLong(
11139 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011141 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011142 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011143 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011144
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 for (i = 0; i < length; i++) {
11147 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011148
Benjamin Peterson29060642009-01-31 22:14:21 +000011149 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11150 return PyBool_FromLong(0);
11151 else if (!cased && Py_UNICODE_ISUPPER(ch))
11152 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011154 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011155}
11156
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011157PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011158 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011160Return True if S is a titlecased string and there is at least one\n\
11161character in S, i.e. upper- and titlecase characters may only\n\
11162follow uncased characters and lowercase characters only cased ones.\n\
11163Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011164
11165static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011166unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011168 Py_ssize_t i, length;
11169 int kind;
11170 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171 int cased, previous_is_cased;
11172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173 if (PyUnicode_READY(self) == -1)
11174 return NULL;
11175 length = PyUnicode_GET_LENGTH(self);
11176 kind = PyUnicode_KIND(self);
11177 data = PyUnicode_DATA(self);
11178
Guido van Rossumd57fd912000-03-10 22:53:23 +000011179 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 if (length == 1) {
11181 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11182 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11183 (Py_UNICODE_ISUPPER(ch) != 0));
11184 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011186 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011187 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011188 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011189
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190 cased = 0;
11191 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011192 for (i = 0; i < length; i++) {
11193 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011194
Benjamin Peterson29060642009-01-31 22:14:21 +000011195 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11196 if (previous_is_cased)
11197 return PyBool_FromLong(0);
11198 previous_is_cased = 1;
11199 cased = 1;
11200 }
11201 else if (Py_UNICODE_ISLOWER(ch)) {
11202 if (!previous_is_cased)
11203 return PyBool_FromLong(0);
11204 previous_is_cased = 1;
11205 cased = 1;
11206 }
11207 else
11208 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011210 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211}
11212
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011213PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011214 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011216Return True if all characters in S are whitespace\n\
11217and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218
11219static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011220unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011222 Py_ssize_t i, length;
11223 int kind;
11224 void *data;
11225
11226 if (PyUnicode_READY(self) == -1)
11227 return NULL;
11228 length = PyUnicode_GET_LENGTH(self);
11229 kind = PyUnicode_KIND(self);
11230 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011233 if (length == 1)
11234 return PyBool_FromLong(
11235 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011237 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011238 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011239 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011241 for (i = 0; i < length; i++) {
11242 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011243 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011244 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011246 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247}
11248
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011249PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011250 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011251\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011252Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011253and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011254
11255static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011256unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011257{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011258 Py_ssize_t i, length;
11259 int kind;
11260 void *data;
11261
11262 if (PyUnicode_READY(self) == -1)
11263 return NULL;
11264 length = PyUnicode_GET_LENGTH(self);
11265 kind = PyUnicode_KIND(self);
11266 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011267
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011268 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011269 if (length == 1)
11270 return PyBool_FromLong(
11271 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011272
11273 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011274 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011275 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011277 for (i = 0; i < length; i++) {
11278 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011279 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011280 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011281 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011282}
11283
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011284PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011285 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011286\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011287Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011288and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011289
11290static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011291unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011292{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011293 int kind;
11294 void *data;
11295 Py_ssize_t len, i;
11296
11297 if (PyUnicode_READY(self) == -1)
11298 return NULL;
11299
11300 kind = PyUnicode_KIND(self);
11301 data = PyUnicode_DATA(self);
11302 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011303
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011304 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011305 if (len == 1) {
11306 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11307 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11308 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011309
11310 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011311 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011312 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011314 for (i = 0; i < len; i++) {
11315 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011316 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011317 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011318 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011319 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011320}
11321
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011322PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011323 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011325Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011326False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327
11328static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011329unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011330{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 Py_ssize_t i, length;
11332 int kind;
11333 void *data;
11334
11335 if (PyUnicode_READY(self) == -1)
11336 return NULL;
11337 length = PyUnicode_GET_LENGTH(self);
11338 kind = PyUnicode_KIND(self);
11339 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011340
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 if (length == 1)
11343 return PyBool_FromLong(
11344 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011346 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011347 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011348 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011350 for (i = 0; i < length; i++) {
11351 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011352 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011354 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011355}
11356
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011357PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011358 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011360Return True if all characters in S are digits\n\
11361and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362
11363static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011364unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011365{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011366 Py_ssize_t i, length;
11367 int kind;
11368 void *data;
11369
11370 if (PyUnicode_READY(self) == -1)
11371 return NULL;
11372 length = PyUnicode_GET_LENGTH(self);
11373 kind = PyUnicode_KIND(self);
11374 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011377 if (length == 1) {
11378 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11379 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011382 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011383 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011384 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011386 for (i = 0; i < length; i++) {
11387 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011388 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011390 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011391}
11392
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011393PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011394 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011395\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011396Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011397False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398
11399static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011400unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011402 Py_ssize_t i, length;
11403 int kind;
11404 void *data;
11405
11406 if (PyUnicode_READY(self) == -1)
11407 return NULL;
11408 length = PyUnicode_GET_LENGTH(self);
11409 kind = PyUnicode_KIND(self);
11410 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413 if (length == 1)
11414 return PyBool_FromLong(
11415 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011417 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011418 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011419 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011421 for (i = 0; i < length; i++) {
11422 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011423 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011425 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426}
11427
Martin v. Löwis47383402007-08-15 07:32:56 +000011428int
11429PyUnicode_IsIdentifier(PyObject *self)
11430{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431 int kind;
11432 void *data;
11433 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011434 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011436 if (PyUnicode_READY(self) == -1) {
11437 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011438 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439 }
11440
11441 /* Special case for empty strings */
11442 if (PyUnicode_GET_LENGTH(self) == 0)
11443 return 0;
11444 kind = PyUnicode_KIND(self);
11445 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011446
11447 /* PEP 3131 says that the first character must be in
11448 XID_Start and subsequent characters in XID_Continue,
11449 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011450 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011451 letters, digits, underscore). However, given the current
11452 definition of XID_Start and XID_Continue, it is sufficient
11453 to check just for these, except that _ must be allowed
11454 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011456 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011457 return 0;
11458
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011459 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011460 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011461 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011462 return 1;
11463}
11464
11465PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011466 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011467\n\
11468Return True if S is a valid identifier according\n\
11469to the language definition.");
11470
11471static PyObject*
11472unicode_isidentifier(PyObject *self)
11473{
11474 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11475}
11476
Georg Brandl559e5d72008-06-11 18:37:52 +000011477PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011478 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011479\n\
11480Return True if all characters in S are considered\n\
11481printable in repr() or S is empty, False otherwise.");
11482
11483static PyObject*
11484unicode_isprintable(PyObject *self)
11485{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011486 Py_ssize_t i, length;
11487 int kind;
11488 void *data;
11489
11490 if (PyUnicode_READY(self) == -1)
11491 return NULL;
11492 length = PyUnicode_GET_LENGTH(self);
11493 kind = PyUnicode_KIND(self);
11494 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011495
11496 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011497 if (length == 1)
11498 return PyBool_FromLong(
11499 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011501 for (i = 0; i < length; i++) {
11502 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011503 Py_RETURN_FALSE;
11504 }
11505 }
11506 Py_RETURN_TRUE;
11507}
11508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011509PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011510 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511\n\
11512Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011513iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514
11515static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011516unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011518 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519}
11520
Martin v. Löwis18e16552006-02-15 17:27:45 +000011521static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011522unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 if (PyUnicode_READY(self) == -1)
11525 return -1;
11526 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527}
11528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011529PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011530 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011532Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011533done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534
11535static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011536unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011538 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011539 Py_UCS4 fillchar = ' ';
11540
11541 if (PyUnicode_READY(self) == -1)
11542 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011543
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011544 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545 return NULL;
11546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011549 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550 }
11551
Victor Stinner7931d9a2011-11-04 00:22:48 +010011552 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553}
11554
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011555PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011556 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011558Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559
11560static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011561unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563 return fixup(self, fixlower);
11564}
11565
/* Which end(s) of the string a strip operation works on. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip type above. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip type i: the format string minus its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11574
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011575/* externally visible for str.strip(unicode) */
11576PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011577_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011578{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011579 void *data;
11580 int kind;
11581 Py_ssize_t i, j, len;
11582 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011584 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11585 return NULL;
11586
11587 kind = PyUnicode_KIND(self);
11588 data = PyUnicode_DATA(self);
11589 len = PyUnicode_GET_LENGTH(self);
11590 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11591 PyUnicode_DATA(sepobj),
11592 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011593
Benjamin Peterson14339b62009-01-31 16:36:08 +000011594 i = 0;
11595 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011596 while (i < len &&
11597 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011598 i++;
11599 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011600 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011601
Benjamin Peterson14339b62009-01-31 16:36:08 +000011602 j = len;
11603 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011604 do {
11605 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011606 } while (j >= i &&
11607 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011608 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011609 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011610
Victor Stinner7931d9a2011-11-04 00:22:48 +010011611 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011612}
11613
11614PyObject*
11615PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11616{
11617 unsigned char *data;
11618 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011619 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620
Victor Stinnerde636f32011-10-01 03:55:54 +020011621 if (PyUnicode_READY(self) == -1)
11622 return NULL;
11623
11624 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11625
Victor Stinner12bab6d2011-10-01 01:53:49 +020011626 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011628 if (PyUnicode_CheckExact(self)) {
11629 Py_INCREF(self);
11630 return self;
11631 }
11632 else
11633 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011634 }
11635
Victor Stinner12bab6d2011-10-01 01:53:49 +020011636 length = end - start;
11637 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011638 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639
Victor Stinnerde636f32011-10-01 03:55:54 +020011640 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011641 PyErr_SetString(PyExc_IndexError, "string index out of range");
11642 return NULL;
11643 }
11644
Victor Stinnerb9275c12011-10-05 14:01:42 +020011645 if (PyUnicode_IS_ASCII(self)) {
11646 kind = PyUnicode_KIND(self);
11647 data = PyUnicode_1BYTE_DATA(self);
11648 return unicode_fromascii(data + start, length);
11649 }
11650 else {
11651 kind = PyUnicode_KIND(self);
11652 data = PyUnicode_1BYTE_DATA(self);
11653 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011654 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011655 length);
11656 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658
11659static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011660do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662 int kind;
11663 void *data;
11664 Py_ssize_t len, i, j;
11665
11666 if (PyUnicode_READY(self) == -1)
11667 return NULL;
11668
11669 kind = PyUnicode_KIND(self);
11670 data = PyUnicode_DATA(self);
11671 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011672
Benjamin Peterson14339b62009-01-31 16:36:08 +000011673 i = 0;
11674 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011675 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011676 i++;
11677 }
11678 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011679
Benjamin Peterson14339b62009-01-31 16:36:08 +000011680 j = len;
11681 if (striptype != LEFTSTRIP) {
11682 do {
11683 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011684 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011685 j++;
11686 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011687
Victor Stinner7931d9a2011-11-04 00:22:48 +010011688 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689}
11690
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011691
11692static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011693do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011694{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011695 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011696
Benjamin Peterson14339b62009-01-31 16:36:08 +000011697 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11698 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011699
Benjamin Peterson14339b62009-01-31 16:36:08 +000011700 if (sep != NULL && sep != Py_None) {
11701 if (PyUnicode_Check(sep))
11702 return _PyUnicode_XStrip(self, striptype, sep);
11703 else {
11704 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011705 "%s arg must be None or str",
11706 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011707 return NULL;
11708 }
11709 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011710
Benjamin Peterson14339b62009-01-31 16:36:08 +000011711 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011712}
11713
11714
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011715PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011716 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011717\n\
11718Return a copy of the string S with leading and trailing\n\
11719whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011720If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011721
11722static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011723unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011724{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011725 if (PyTuple_GET_SIZE(args) == 0)
11726 return do_strip(self, BOTHSTRIP); /* Common case */
11727 else
11728 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011729}
11730
11731
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011732PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011733 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011734\n\
11735Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011736If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011737
11738static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011739unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011740{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011741 if (PyTuple_GET_SIZE(args) == 0)
11742 return do_strip(self, LEFTSTRIP); /* Common case */
11743 else
11744 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011745}
11746
11747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011748PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011749 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011750\n\
11751Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011752If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011753
11754static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011755unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011756{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011757 if (PyTuple_GET_SIZE(args) == 0)
11758 return do_strip(self, RIGHTSTRIP); /* Common case */
11759 else
11760 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011761}
11762
11763
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011765unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011767 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011768 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769
Georg Brandl222de0f2009-04-12 12:01:50 +000011770 if (len < 1) {
11771 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011772 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011773 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774
Tim Peters7a29bd52001-09-12 03:03:31 +000011775 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776 /* no repeat, return original string */
11777 Py_INCREF(str);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011778 return str;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779 }
Tim Peters8f422462000-09-09 06:13:41 +000011780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 if (PyUnicode_READY(str) == -1)
11782 return NULL;
11783
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011784 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011785 PyErr_SetString(PyExc_OverflowError,
11786 "repeated string is too long");
11787 return NULL;
11788 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011790
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011791 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792 if (!u)
11793 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011794 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011796 if (PyUnicode_GET_LENGTH(str) == 1) {
11797 const int kind = PyUnicode_KIND(str);
11798 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11799 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011800 if (kind == PyUnicode_1BYTE_KIND)
11801 memset(to, (unsigned char)fill_char, len);
11802 else {
11803 for (n = 0; n < len; ++n)
11804 PyUnicode_WRITE(kind, to, n, fill_char);
11805 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011806 }
11807 else {
11808 /* number of characters copied this far */
11809 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011810 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011811 char *to = (char *) PyUnicode_DATA(u);
11812 Py_MEMCPY(to, PyUnicode_DATA(str),
11813 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011814 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 n = (done <= nchars-done) ? done : nchars-done;
11816 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011817 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011818 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819 }
11820
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011821 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011822 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823}
11824
Alexander Belopolsky40018472011-02-26 01:02:56 +000011825PyObject *
11826PyUnicode_Replace(PyObject *obj,
11827 PyObject *subobj,
11828 PyObject *replobj,
11829 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830{
11831 PyObject *self;
11832 PyObject *str1;
11833 PyObject *str2;
11834 PyObject *result;
11835
11836 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011837 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011838 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011840 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011841 Py_DECREF(self);
11842 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843 }
11844 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011845 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011846 Py_DECREF(self);
11847 Py_DECREF(str1);
11848 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011850 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851 Py_DECREF(self);
11852 Py_DECREF(str1);
11853 Py_DECREF(str2);
11854 return result;
11855}
11856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011857PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011858 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859\n\
11860Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011861old replaced by new. If the optional argument count is\n\
11862given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863
11864static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011865unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011867 PyObject *str1;
11868 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011869 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870 PyObject *result;
11871
Martin v. Löwis18e16552006-02-15 17:27:45 +000011872 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011875 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 str1 = PyUnicode_FromObject(str1);
11877 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11878 return NULL;
11879 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011880 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011881 Py_DECREF(str1);
11882 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011883 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884
11885 result = replace(self, str1, str2, maxcount);
11886
11887 Py_DECREF(str1);
11888 Py_DECREF(str2);
11889 return result;
11890}
11891
Alexander Belopolsky40018472011-02-26 01:02:56 +000011892static PyObject *
11893unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011895 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 Py_ssize_t isize;
11897 Py_ssize_t osize, squote, dquote, i, o;
11898 Py_UCS4 max, quote;
11899 int ikind, okind;
11900 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011902 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011903 return NULL;
11904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011905 isize = PyUnicode_GET_LENGTH(unicode);
11906 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 /* Compute length of output, quote characters, and
11909 maximum character */
11910 osize = 2; /* quotes */
11911 max = 127;
11912 squote = dquote = 0;
11913 ikind = PyUnicode_KIND(unicode);
11914 for (i = 0; i < isize; i++) {
11915 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11916 switch (ch) {
11917 case '\'': squote++; osize++; break;
11918 case '"': dquote++; osize++; break;
11919 case '\\': case '\t': case '\r': case '\n':
11920 osize += 2; break;
11921 default:
11922 /* Fast-path ASCII */
11923 if (ch < ' ' || ch == 0x7f)
11924 osize += 4; /* \xHH */
11925 else if (ch < 0x7f)
11926 osize++;
11927 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11928 osize++;
11929 max = ch > max ? ch : max;
11930 }
11931 else if (ch < 0x100)
11932 osize += 4; /* \xHH */
11933 else if (ch < 0x10000)
11934 osize += 6; /* \uHHHH */
11935 else
11936 osize += 10; /* \uHHHHHHHH */
11937 }
11938 }
11939
11940 quote = '\'';
11941 if (squote) {
11942 if (dquote)
11943 /* Both squote and dquote present. Use squote,
11944 and escape them */
11945 osize += squote;
11946 else
11947 quote = '"';
11948 }
11949
11950 repr = PyUnicode_New(osize, max);
11951 if (repr == NULL)
11952 return NULL;
11953 okind = PyUnicode_KIND(repr);
11954 odata = PyUnicode_DATA(repr);
11955
11956 PyUnicode_WRITE(okind, odata, 0, quote);
11957 PyUnicode_WRITE(okind, odata, osize-1, quote);
11958
11959 for (i = 0, o = 1; i < isize; i++) {
11960 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011961
11962 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 if ((ch == quote) || (ch == '\\')) {
11964 PyUnicode_WRITE(okind, odata, o++, '\\');
11965 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011966 continue;
11967 }
11968
Benjamin Peterson29060642009-01-31 22:14:21 +000011969 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011970 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 PyUnicode_WRITE(okind, odata, o++, '\\');
11972 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011973 }
11974 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 PyUnicode_WRITE(okind, odata, o++, '\\');
11976 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011977 }
11978 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 PyUnicode_WRITE(okind, odata, o++, '\\');
11980 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011981 }
11982
11983 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011984 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011985 PyUnicode_WRITE(okind, odata, o++, '\\');
11986 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011987 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11988 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011989 }
11990
Georg Brandl559e5d72008-06-11 18:37:52 +000011991 /* Copy ASCII characters as-is */
11992 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011994 }
11995
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011997 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011998 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011999 (categories Z* and C* except ASCII space)
12000 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012002 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 if (ch <= 0xff) {
12004 PyUnicode_WRITE(okind, odata, o++, '\\');
12005 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012006 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12007 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012008 }
12009 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012010 else if (ch >= 0x10000) {
12011 PyUnicode_WRITE(okind, odata, o++, '\\');
12012 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012013 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12014 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12015 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12016 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12017 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12018 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12019 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12020 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012021 }
12022 /* Map 16-bit characters to '\uxxxx' */
12023 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 PyUnicode_WRITE(okind, odata, o++, '\\');
12025 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012026 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12027 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12028 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12029 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012030 }
12031 }
12032 /* Copy characters as-is */
12033 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012035 }
12036 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012037 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012039 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012040 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041}
12042
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012043PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012044 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045\n\
12046Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012047such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048arguments start and end are interpreted as in slice notation.\n\
12049\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012050Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051
12052static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012055 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012056 Py_ssize_t start;
12057 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012058 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059
Jesus Ceaac451502011-04-20 17:09:23 +020012060 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12061 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012062 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012064 if (PyUnicode_READY(self) == -1)
12065 return NULL;
12066 if (PyUnicode_READY(substring) == -1)
12067 return NULL;
12068
Victor Stinner7931d9a2011-11-04 00:22:48 +010012069 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070
12071 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012072
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012073 if (result == -2)
12074 return NULL;
12075
Christian Heimes217cfd12007-12-02 14:31:20 +000012076 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077}
12078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012079PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012080 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012082Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083
12084static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012085unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012087 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012088 Py_ssize_t start;
12089 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012090 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091
Jesus Ceaac451502011-04-20 17:09:23 +020012092 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12093 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012094 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012096 if (PyUnicode_READY(self) == -1)
12097 return NULL;
12098 if (PyUnicode_READY(substring) == -1)
12099 return NULL;
12100
Victor Stinner7931d9a2011-11-04 00:22:48 +010012101 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102
12103 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 if (result == -2)
12106 return NULL;
12107
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108 if (result < 0) {
12109 PyErr_SetString(PyExc_ValueError, "substring not found");
12110 return NULL;
12111 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112
Christian Heimes217cfd12007-12-02 14:31:20 +000012113 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012114}
12115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012116PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012117 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012119Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012120done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121
12122static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012123unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012125 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012126 Py_UCS4 fillchar = ' ';
12127
Victor Stinnere9a29352011-10-01 02:14:59 +020012128 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012130
Victor Stinnere9a29352011-10-01 02:14:59 +020012131 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012132 return NULL;
12133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012136 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137 }
12138
Victor Stinner7931d9a2011-11-04 00:22:48 +010012139 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012140}
12141
Alexander Belopolsky40018472011-02-26 01:02:56 +000012142PyObject *
12143PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012144{
12145 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012146
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147 s = PyUnicode_FromObject(s);
12148 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012149 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012150 if (sep != NULL) {
12151 sep = PyUnicode_FromObject(sep);
12152 if (sep == NULL) {
12153 Py_DECREF(s);
12154 return NULL;
12155 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156 }
12157
Victor Stinner9310abb2011-10-05 00:59:23 +020012158 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159
12160 Py_DECREF(s);
12161 Py_XDECREF(sep);
12162 return result;
12163}
12164
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012165PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012166 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167\n\
12168Return a list of the words in S, using sep as the\n\
12169delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012170splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012171whitespace string is a separator and empty strings are\n\
12172removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173
12174static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012175unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176{
12177 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012178 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179
Martin v. Löwis18e16552006-02-15 17:27:45 +000012180 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181 return NULL;
12182
12183 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012184 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012185 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012186 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012188 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189}
12190
Thomas Wouters477c8d52006-05-27 19:21:47 +000012191PyObject *
12192PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12193{
12194 PyObject* str_obj;
12195 PyObject* sep_obj;
12196 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 int kind1, kind2, kind;
12198 void *buf1 = NULL, *buf2 = NULL;
12199 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012200
12201 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012202 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012203 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012204 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012206 Py_DECREF(str_obj);
12207 return NULL;
12208 }
12209
Victor Stinner14f8f022011-10-05 20:58:25 +020012210 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012211 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012212 kind = Py_MAX(kind1, kind2);
12213 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012215 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012216 if (!buf1)
12217 goto onError;
12218 buf2 = PyUnicode_DATA(sep_obj);
12219 if (kind2 != kind)
12220 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12221 if (!buf2)
12222 goto onError;
12223 len1 = PyUnicode_GET_LENGTH(str_obj);
12224 len2 = PyUnicode_GET_LENGTH(sep_obj);
12225
Victor Stinner14f8f022011-10-05 20:58:25 +020012226 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012227 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012228 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12229 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12230 else
12231 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012232 break;
12233 case PyUnicode_2BYTE_KIND:
12234 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12235 break;
12236 case PyUnicode_4BYTE_KIND:
12237 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12238 break;
12239 default:
12240 assert(0);
12241 out = 0;
12242 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012243
12244 Py_DECREF(sep_obj);
12245 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012246 if (kind1 != kind)
12247 PyMem_Free(buf1);
12248 if (kind2 != kind)
12249 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012250
12251 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012252 onError:
12253 Py_DECREF(sep_obj);
12254 Py_DECREF(str_obj);
12255 if (kind1 != kind && buf1)
12256 PyMem_Free(buf1);
12257 if (kind2 != kind && buf2)
12258 PyMem_Free(buf2);
12259 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012260}
12261
12262
12263PyObject *
12264PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12265{
12266 PyObject* str_obj;
12267 PyObject* sep_obj;
12268 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012269 int kind1, kind2, kind;
12270 void *buf1 = NULL, *buf2 = NULL;
12271 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012272
12273 str_obj = PyUnicode_FromObject(str_in);
12274 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012275 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012276 sep_obj = PyUnicode_FromObject(sep_in);
12277 if (!sep_obj) {
12278 Py_DECREF(str_obj);
12279 return NULL;
12280 }
12281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282 kind1 = PyUnicode_KIND(str_in);
12283 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012284 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012285 buf1 = PyUnicode_DATA(str_in);
12286 if (kind1 != kind)
12287 buf1 = _PyUnicode_AsKind(str_in, kind);
12288 if (!buf1)
12289 goto onError;
12290 buf2 = PyUnicode_DATA(sep_obj);
12291 if (kind2 != kind)
12292 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12293 if (!buf2)
12294 goto onError;
12295 len1 = PyUnicode_GET_LENGTH(str_obj);
12296 len2 = PyUnicode_GET_LENGTH(sep_obj);
12297
12298 switch(PyUnicode_KIND(str_in)) {
12299 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012300 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12301 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12302 else
12303 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 break;
12305 case PyUnicode_2BYTE_KIND:
12306 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12307 break;
12308 case PyUnicode_4BYTE_KIND:
12309 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12310 break;
12311 default:
12312 assert(0);
12313 out = 0;
12314 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012315
12316 Py_DECREF(sep_obj);
12317 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318 if (kind1 != kind)
12319 PyMem_Free(buf1);
12320 if (kind2 != kind)
12321 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012322
12323 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 onError:
12325 Py_DECREF(sep_obj);
12326 Py_DECREF(str_obj);
12327 if (kind1 != kind && buf1)
12328 PyMem_Free(buf1);
12329 if (kind2 != kind && buf2)
12330 PyMem_Free(buf2);
12331 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012332}
12333
12334PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012335 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012336\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012337Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012338the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012339found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012340
12341static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012342unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012343{
Victor Stinner9310abb2011-10-05 00:59:23 +020012344 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012345}
12346
12347PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012348 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012349\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012350Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012351the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012352separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012353
12354static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012355unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012356{
Victor Stinner9310abb2011-10-05 00:59:23 +020012357 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012358}
12359
Alexander Belopolsky40018472011-02-26 01:02:56 +000012360PyObject *
12361PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012362{
12363 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012364
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012365 s = PyUnicode_FromObject(s);
12366 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012367 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012368 if (sep != NULL) {
12369 sep = PyUnicode_FromObject(sep);
12370 if (sep == NULL) {
12371 Py_DECREF(s);
12372 return NULL;
12373 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012374 }
12375
Victor Stinner9310abb2011-10-05 00:59:23 +020012376 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012377
12378 Py_DECREF(s);
12379 Py_XDECREF(sep);
12380 return result;
12381}
12382
12383PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012384 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012385\n\
12386Return a list of the words in S, using sep as the\n\
12387delimiter string, starting at the end of the string and\n\
12388working to the front. If maxsplit is given, at most maxsplit\n\
12389splits are done. If sep is not specified, any whitespace string\n\
12390is a separator.");
12391
12392static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012393unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012394{
12395 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012396 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012397
Martin v. Löwis18e16552006-02-15 17:27:45 +000012398 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012399 return NULL;
12400
12401 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012402 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012403 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012404 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012405 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012406 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012407}
12408
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012409PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012410 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012411\n\
12412Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012413Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012414is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012415
12416static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012417unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012418{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012419 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012420 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012421
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012422 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12423 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012424 return NULL;
12425
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012426 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012427}
12428
12429static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012430PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012431{
Walter Dörwald346737f2007-05-31 10:44:43 +000012432 if (PyUnicode_CheckExact(self)) {
12433 Py_INCREF(self);
12434 return self;
12435 } else
12436 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012437 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012438}
12439
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012440PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012441 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012442\n\
12443Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012444and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012445
12446static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012447unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012448{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012449 return fixup(self, fixswapcase);
12450}
12451
Georg Brandlceee0772007-11-27 23:48:05 +000012452PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012453 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012454\n\
12455Return a translation table usable for str.translate().\n\
12456If there is only one argument, it must be a dictionary mapping Unicode\n\
12457ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012458Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012459If there are two arguments, they must be strings of equal length, and\n\
12460in the resulting dictionary, each character in x will be mapped to the\n\
12461character at the same position in y. If there is a third argument, it\n\
12462must be a string, whose characters will be mapped to None in the result.");
12463
12464static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012465unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012466{
12467 PyObject *x, *y = NULL, *z = NULL;
12468 PyObject *new = NULL, *key, *value;
12469 Py_ssize_t i = 0;
12470 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012471
Georg Brandlceee0772007-11-27 23:48:05 +000012472 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12473 return NULL;
12474 new = PyDict_New();
12475 if (!new)
12476 return NULL;
12477 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012478 int x_kind, y_kind, z_kind;
12479 void *x_data, *y_data, *z_data;
12480
Georg Brandlceee0772007-11-27 23:48:05 +000012481 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012482 if (!PyUnicode_Check(x)) {
12483 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12484 "be a string if there is a second argument");
12485 goto err;
12486 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012487 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012488 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12489 "arguments must have equal length");
12490 goto err;
12491 }
12492 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012493 x_kind = PyUnicode_KIND(x);
12494 y_kind = PyUnicode_KIND(y);
12495 x_data = PyUnicode_DATA(x);
12496 y_data = PyUnicode_DATA(y);
12497 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12498 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12499 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012500 if (!key || !value)
12501 goto err;
12502 res = PyDict_SetItem(new, key, value);
12503 Py_DECREF(key);
12504 Py_DECREF(value);
12505 if (res < 0)
12506 goto err;
12507 }
12508 /* create entries for deleting chars in z */
12509 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012510 z_kind = PyUnicode_KIND(z);
12511 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012512 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012513 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012514 if (!key)
12515 goto err;
12516 res = PyDict_SetItem(new, key, Py_None);
12517 Py_DECREF(key);
12518 if (res < 0)
12519 goto err;
12520 }
12521 }
12522 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012523 int kind;
12524 void *data;
12525
Georg Brandlceee0772007-11-27 23:48:05 +000012526 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012527 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012528 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12529 "to maketrans it must be a dict");
12530 goto err;
12531 }
12532 /* copy entries into the new dict, converting string keys to int keys */
12533 while (PyDict_Next(x, &i, &key, &value)) {
12534 if (PyUnicode_Check(key)) {
12535 /* convert string keys to integer keys */
12536 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012537 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012538 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12539 "table must be of length 1");
12540 goto err;
12541 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012542 kind = PyUnicode_KIND(key);
12543 data = PyUnicode_DATA(key);
12544 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012545 if (!newkey)
12546 goto err;
12547 res = PyDict_SetItem(new, newkey, value);
12548 Py_DECREF(newkey);
12549 if (res < 0)
12550 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012551 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012552 /* just keep integer keys */
12553 if (PyDict_SetItem(new, key, value) < 0)
12554 goto err;
12555 } else {
12556 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12557 "be strings or integers");
12558 goto err;
12559 }
12560 }
12561 }
12562 return new;
12563 err:
12564 Py_DECREF(new);
12565 return NULL;
12566}
12567
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012568PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012569 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570\n\
12571Return a copy of the string S, where all characters have been mapped\n\
12572through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012573Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012574Unmapped characters are left untouched. Characters mapped to None\n\
12575are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012576
12577static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012578unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012580 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581}
12582
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012583PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012584 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012585\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012586Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012587
12588static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012589unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012590{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012591 return fixup(self, fixupper);
12592}
12593
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012594PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012595 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012597Pad a numeric string S with zeros on the left, to fill a field\n\
12598of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012599
12600static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012601unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012602{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012603 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012604 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012605 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012606 int kind;
12607 void *data;
12608 Py_UCS4 chr;
12609
12610 if (PyUnicode_READY(self) == -1)
12611 return NULL;
12612
Martin v. Löwis18e16552006-02-15 17:27:45 +000012613 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614 return NULL;
12615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012616 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012617 if (PyUnicode_CheckExact(self)) {
12618 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012619 return self;
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012620 }
12621 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012622 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623 }
12624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626
12627 u = pad(self, fill, 0, '0');
12628
Walter Dörwald068325e2002-04-15 13:36:47 +000012629 if (u == NULL)
12630 return NULL;
12631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 kind = PyUnicode_KIND(u);
12633 data = PyUnicode_DATA(u);
12634 chr = PyUnicode_READ(kind, data, fill);
12635
12636 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012638 PyUnicode_WRITE(kind, data, 0, chr);
12639 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012640 }
12641
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012642 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012643 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012644}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012645
#if 0
/* Compiled out: thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12653
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012654PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012655 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012656\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012657Return True if S starts with the specified prefix, False otherwise.\n\
12658With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012659With optional end, stop comparing S at that position.\n\
12660prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661
12662static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012663unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012664 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012665{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012666 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012667 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012668 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012669 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012670 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012671
Jesus Ceaac451502011-04-20 17:09:23 +020012672 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012673 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012674 if (PyTuple_Check(subobj)) {
12675 Py_ssize_t i;
12676 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012677 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012678 if (substring == NULL)
12679 return NULL;
12680 result = tailmatch(self, substring, start, end, -1);
12681 Py_DECREF(substring);
12682 if (result) {
12683 Py_RETURN_TRUE;
12684 }
12685 }
12686 /* nothing matched */
12687 Py_RETURN_FALSE;
12688 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012689 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012690 if (substring == NULL) {
12691 if (PyErr_ExceptionMatches(PyExc_TypeError))
12692 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12693 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012694 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012695 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012696 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012698 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699}
12700
12701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012702PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012703 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012704\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012705Return True if S ends with the specified suffix, False otherwise.\n\
12706With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012707With optional end, stop comparing S at that position.\n\
12708suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012709
12710static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012711unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012712 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012713{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012714 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012715 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012716 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012717 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012718 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719
Jesus Ceaac451502011-04-20 17:09:23 +020012720 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012721 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012722 if (PyTuple_Check(subobj)) {
12723 Py_ssize_t i;
12724 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012725 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012726 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012727 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012728 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012729 result = tailmatch(self, substring, start, end, +1);
12730 Py_DECREF(substring);
12731 if (result) {
12732 Py_RETURN_TRUE;
12733 }
12734 }
12735 Py_RETURN_FALSE;
12736 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012737 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012738 if (substring == NULL) {
12739 if (PyErr_ExceptionMatches(PyExc_TypeError))
12740 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12741 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012742 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012743 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012744 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012745 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012746 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012747}
12748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012749#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012750
12751PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012752 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012753\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012754Return a formatted version of S, using substitutions from args and kwargs.\n\
12755The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012756
Eric Smith27bbca62010-11-04 17:06:58 +000012757PyDoc_STRVAR(format_map__doc__,
12758 "S.format_map(mapping) -> str\n\
12759\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012760Return a formatted version of S, using substitutions from mapping.\n\
12761The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012762
Eric Smith4a7d76d2008-05-30 18:10:19 +000012763static PyObject *
12764unicode__format__(PyObject* self, PyObject* args)
12765{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012766 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012767
12768 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12769 return NULL;
12770
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012771 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012772 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012773 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012774}
12775
Eric Smith8c663262007-08-25 02:26:07 +000012776PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012777 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012778\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012779Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012780
12781static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012782unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012783{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012784 Py_ssize_t size;
12785
12786 /* If it's a compact object, account for base structure +
12787 character data. */
12788 if (PyUnicode_IS_COMPACT_ASCII(v))
12789 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12790 else if (PyUnicode_IS_COMPACT(v))
12791 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012792 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012793 else {
12794 /* If it is a two-block object, account for base object, and
12795 for character block if present. */
12796 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012797 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012798 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012799 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012800 }
12801 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012802 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012803 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012804 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012805 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012806 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012807
12808 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012809}
12810
12811PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012812 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012813
12814static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012815unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012816{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012817 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012818 if (!copy)
12819 return NULL;
12820 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012821}
12822
Guido van Rossumd57fd912000-03-10 22:53:23 +000012823static PyMethodDef unicode_methods[] = {
12824
12825 /* Order is according to common usage: often used methods should
12826 appear first, since lookup is done sequentially. */
12827
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012828 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012829 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12830 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012831 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012832 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12833 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12834 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12835 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12836 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12837 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12838 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012839 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012840 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12841 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12842 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012843 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012844 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12845 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12846 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012847 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012848 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012849 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012850 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012851 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12852 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12853 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12854 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12855 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12856 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12857 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12858 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12859 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12860 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12861 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12862 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12863 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12864 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012865 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012866 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012867 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012868 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012869 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012870 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012871 {"maketrans", (PyCFunction) unicode_maketrans,
12872 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012873 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012874#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012875 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876#endif
12877
12878#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012879 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012880 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012881#endif
12882
Benjamin Peterson14339b62009-01-31 16:36:08 +000012883 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012884 {NULL, NULL}
12885};
12886
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012887static PyObject *
12888unicode_mod(PyObject *v, PyObject *w)
12889{
Brian Curtindfc80e32011-08-10 20:28:54 -050012890 if (!PyUnicode_Check(v))
12891 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012892 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012893}
12894
12895static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012896 0, /*nb_add*/
12897 0, /*nb_subtract*/
12898 0, /*nb_multiply*/
12899 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012900};
12901
Guido van Rossumd57fd912000-03-10 22:53:23 +000012902static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012903 (lenfunc) unicode_length, /* sq_length */
12904 PyUnicode_Concat, /* sq_concat */
12905 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12906 (ssizeargfunc) unicode_getitem, /* sq_item */
12907 0, /* sq_slice */
12908 0, /* sq_ass_item */
12909 0, /* sq_ass_slice */
12910 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012911};
12912
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012913static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012914unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012915{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012916 if (PyUnicode_READY(self) == -1)
12917 return NULL;
12918
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012919 if (PyIndex_Check(item)) {
12920 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012921 if (i == -1 && PyErr_Occurred())
12922 return NULL;
12923 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012924 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012925 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012926 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012927 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012928 PyObject *result;
12929 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012930 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012931 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012933 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012934 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012935 return NULL;
12936 }
12937
12938 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012939 return PyUnicode_New(0, 0);
12940 } else if (start == 0 && step == 1 &&
12941 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012942 PyUnicode_CheckExact(self)) {
12943 Py_INCREF(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012944 return self;
Thomas Woutersed03b412007-08-28 21:37:11 +000012945 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012946 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020012947 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012948 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012949 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012950 src_kind = PyUnicode_KIND(self);
12951 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020012952 if (!PyUnicode_IS_ASCII(self)) {
12953 kind_limit = kind_maxchar_limit(src_kind);
12954 max_char = 0;
12955 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12956 ch = PyUnicode_READ(src_kind, src_data, cur);
12957 if (ch > max_char) {
12958 max_char = ch;
12959 if (max_char >= kind_limit)
12960 break;
12961 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012962 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012963 }
Victor Stinner55c99112011-10-13 01:17:06 +020012964 else
12965 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012966 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012967 if (result == NULL)
12968 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012969 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012970 dest_data = PyUnicode_DATA(result);
12971
12972 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012973 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12974 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012975 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012976 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012977 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012978 } else {
12979 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12980 return NULL;
12981 }
12982}
12983
12984static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012985 (lenfunc)unicode_length, /* mp_length */
12986 (binaryfunc)unicode_subscript, /* mp_subscript */
12987 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012988};
12989
Guido van Rossumd57fd912000-03-10 22:53:23 +000012990
Guido van Rossumd57fd912000-03-10 22:53:23 +000012991/* Helpers for PyUnicode_Format() */
12992
12993static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012994getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012995{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012996 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012997 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012998 (*p_argidx)++;
12999 if (arglen < 0)
13000 return args;
13001 else
13002 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013003 }
13004 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013005 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013006 return NULL;
13007}
13008
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013009/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013010
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013011static PyObject *
13012formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013013{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013014 char *p;
13015 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013016 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013017
Guido van Rossumd57fd912000-03-10 22:53:23 +000013018 x = PyFloat_AsDouble(v);
13019 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013020 return NULL;
13021
Guido van Rossumd57fd912000-03-10 22:53:23 +000013022 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013023 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013024
Eric Smith0923d1d2009-04-16 20:16:10 +000013025 p = PyOS_double_to_string(x, type, prec,
13026 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013027 if (p == NULL)
13028 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013029 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013030 PyMem_Free(p);
13031 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013032}
13033
Tim Peters38fd5b62000-09-21 05:43:11 +000013034static PyObject*
13035formatlong(PyObject *val, int flags, int prec, int type)
13036{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013037 char *buf;
13038 int len;
13039 PyObject *str; /* temporary string object. */
13040 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013041
Benjamin Peterson14339b62009-01-31 16:36:08 +000013042 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13043 if (!str)
13044 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013045 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013046 Py_DECREF(str);
13047 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013048}
13049
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013050static Py_UCS4
13051formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013052{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013053 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013054 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013055 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013056 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013057 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013058 goto onError;
13059 }
13060 else {
13061 /* Integer input truncated to a character */
13062 long x;
13063 x = PyLong_AsLong(v);
13064 if (x == -1 && PyErr_Occurred())
13065 goto onError;
13066
Victor Stinner8faf8212011-12-08 22:14:11 +010013067 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013068 PyErr_SetString(PyExc_OverflowError,
13069 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013070 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013071 }
13072
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013073 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013074 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013075
Benjamin Peterson29060642009-01-31 22:14:21 +000013076 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013077 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013078 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013079 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013080}
13081
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013082static int
13083repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13084{
13085 int r;
13086 assert(count > 0);
13087 assert(PyUnicode_Check(obj));
13088 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013089 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013090 if (repeated == NULL)
13091 return -1;
13092 r = _PyAccu_Accumulate(acc, repeated);
13093 Py_DECREF(repeated);
13094 return r;
13095 }
13096 else {
13097 do {
13098 if (_PyAccu_Accumulate(acc, obj))
13099 return -1;
13100 } while (--count);
13101 return 0;
13102 }
13103}
13104
Alexander Belopolsky40018472011-02-26 01:02:56 +000013105PyObject *
13106PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013108 void *fmt;
13109 int fmtkind;
13110 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013111 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013112 int r;
13113 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013116 PyObject *temp = NULL;
13117 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013118 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013119 _PyAccu acc;
13120 static PyObject *plus, *minus, *blank, *zero, *percent;
13121
13122 if (!plus && !(plus = get_latin1_char('+')))
13123 return NULL;
13124 if (!minus && !(minus = get_latin1_char('-')))
13125 return NULL;
13126 if (!blank && !(blank = get_latin1_char(' ')))
13127 return NULL;
13128 if (!zero && !(zero = get_latin1_char('0')))
13129 return NULL;
13130 if (!percent && !(percent = get_latin1_char('%')))
13131 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013132
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013134 PyErr_BadInternalCall();
13135 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013137 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013138 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013139 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013140 if (_PyAccu_Init(&acc))
13141 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013142 fmt = PyUnicode_DATA(uformat);
13143 fmtkind = PyUnicode_KIND(uformat);
13144 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13145 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013146
Guido van Rossumd57fd912000-03-10 22:53:23 +000013147 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013148 arglen = PyTuple_Size(args);
13149 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013150 }
13151 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013152 arglen = -1;
13153 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013154 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013155 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013156 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013157 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013158
13159 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013160 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013161 PyObject *nonfmt;
13162 Py_ssize_t nonfmtpos;
13163 nonfmtpos = fmtpos++;
13164 while (fmtcnt >= 0 &&
13165 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13166 fmtpos++;
13167 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013168 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013169 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013170 if (nonfmt == NULL)
13171 goto onError;
13172 r = _PyAccu_Accumulate(&acc, nonfmt);
13173 Py_DECREF(nonfmt);
13174 if (r)
13175 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013176 }
13177 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013178 /* Got a format specifier */
13179 int flags = 0;
13180 Py_ssize_t width = -1;
13181 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013182 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013183 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013184 int isnumok;
13185 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013186 void *pbuf = NULL;
13187 Py_ssize_t pindex, len;
13188 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013190 fmtpos++;
13191 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13192 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013193 Py_ssize_t keylen;
13194 PyObject *key;
13195 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013196
Benjamin Peterson29060642009-01-31 22:14:21 +000013197 if (dict == NULL) {
13198 PyErr_SetString(PyExc_TypeError,
13199 "format requires a mapping");
13200 goto onError;
13201 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013202 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013203 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013204 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013205 /* Skip over balanced parentheses */
13206 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013207 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013208 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013209 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013210 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013211 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013212 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013213 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013214 if (fmtcnt < 0 || pcount > 0) {
13215 PyErr_SetString(PyExc_ValueError,
13216 "incomplete format key");
13217 goto onError;
13218 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013219 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013220 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013221 if (key == NULL)
13222 goto onError;
13223 if (args_owned) {
13224 Py_DECREF(args);
13225 args_owned = 0;
13226 }
13227 args = PyObject_GetItem(dict, key);
13228 Py_DECREF(key);
13229 if (args == NULL) {
13230 goto onError;
13231 }
13232 args_owned = 1;
13233 arglen = -1;
13234 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013235 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013236 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013237 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013238 case '-': flags |= F_LJUST; continue;
13239 case '+': flags |= F_SIGN; continue;
13240 case ' ': flags |= F_BLANK; continue;
13241 case '#': flags |= F_ALT; continue;
13242 case '0': flags |= F_ZERO; continue;
13243 }
13244 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013245 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013246 if (c == '*') {
13247 v = getnextarg(args, arglen, &argidx);
13248 if (v == NULL)
13249 goto onError;
13250 if (!PyLong_Check(v)) {
13251 PyErr_SetString(PyExc_TypeError,
13252 "* wants int");
13253 goto onError;
13254 }
13255 width = PyLong_AsLong(v);
13256 if (width == -1 && PyErr_Occurred())
13257 goto onError;
13258 if (width < 0) {
13259 flags |= F_LJUST;
13260 width = -width;
13261 }
13262 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013263 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013264 }
13265 else if (c >= '0' && c <= '9') {
13266 width = c - '0';
13267 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013268 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013269 if (c < '0' || c > '9')
13270 break;
13271 if ((width*10) / 10 != width) {
13272 PyErr_SetString(PyExc_ValueError,
13273 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013274 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013275 }
13276 width = width*10 + (c - '0');
13277 }
13278 }
13279 if (c == '.') {
13280 prec = 0;
13281 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013282 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013283 if (c == '*') {
13284 v = getnextarg(args, arglen, &argidx);
13285 if (v == NULL)
13286 goto onError;
13287 if (!PyLong_Check(v)) {
13288 PyErr_SetString(PyExc_TypeError,
13289 "* wants int");
13290 goto onError;
13291 }
13292 prec = PyLong_AsLong(v);
13293 if (prec == -1 && PyErr_Occurred())
13294 goto onError;
13295 if (prec < 0)
13296 prec = 0;
13297 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013298 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013299 }
13300 else if (c >= '0' && c <= '9') {
13301 prec = c - '0';
13302 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013303 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013304 if (c < '0' || c > '9')
13305 break;
13306 if ((prec*10) / 10 != prec) {
13307 PyErr_SetString(PyExc_ValueError,
13308 "prec too big");
13309 goto onError;
13310 }
13311 prec = prec*10 + (c - '0');
13312 }
13313 }
13314 } /* prec */
13315 if (fmtcnt >= 0) {
13316 if (c == 'h' || c == 'l' || c == 'L') {
13317 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013318 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013319 }
13320 }
13321 if (fmtcnt < 0) {
13322 PyErr_SetString(PyExc_ValueError,
13323 "incomplete format");
13324 goto onError;
13325 }
13326 if (c != '%') {
13327 v = getnextarg(args, arglen, &argidx);
13328 if (v == NULL)
13329 goto onError;
13330 }
13331 sign = 0;
13332 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013333 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013334 switch (c) {
13335
13336 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013337 _PyAccu_Accumulate(&acc, percent);
13338 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013339
13340 case 's':
13341 case 'r':
13342 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013343 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013344 temp = v;
13345 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013346 }
13347 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013348 if (c == 's')
13349 temp = PyObject_Str(v);
13350 else if (c == 'r')
13351 temp = PyObject_Repr(v);
13352 else
13353 temp = PyObject_ASCII(v);
13354 if (temp == NULL)
13355 goto onError;
13356 if (PyUnicode_Check(temp))
13357 /* nothing to do */;
13358 else {
13359 Py_DECREF(temp);
13360 PyErr_SetString(PyExc_TypeError,
13361 "%s argument has non-string str()");
13362 goto onError;
13363 }
13364 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013365 if (PyUnicode_READY(temp) == -1) {
13366 Py_CLEAR(temp);
13367 goto onError;
13368 }
13369 pbuf = PyUnicode_DATA(temp);
13370 kind = PyUnicode_KIND(temp);
13371 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013372 if (prec >= 0 && len > prec)
13373 len = prec;
13374 break;
13375
13376 case 'i':
13377 case 'd':
13378 case 'u':
13379 case 'o':
13380 case 'x':
13381 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013382 isnumok = 0;
13383 if (PyNumber_Check(v)) {
13384 PyObject *iobj=NULL;
13385
13386 if (PyLong_Check(v)) {
13387 iobj = v;
13388 Py_INCREF(iobj);
13389 }
13390 else {
13391 iobj = PyNumber_Long(v);
13392 }
13393 if (iobj!=NULL) {
13394 if (PyLong_Check(iobj)) {
13395 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013396 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013397 Py_DECREF(iobj);
13398 if (!temp)
13399 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013400 if (PyUnicode_READY(temp) == -1) {
13401 Py_CLEAR(temp);
13402 goto onError;
13403 }
13404 pbuf = PyUnicode_DATA(temp);
13405 kind = PyUnicode_KIND(temp);
13406 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013407 sign = 1;
13408 }
13409 else {
13410 Py_DECREF(iobj);
13411 }
13412 }
13413 }
13414 if (!isnumok) {
13415 PyErr_Format(PyExc_TypeError,
13416 "%%%c format: a number is required, "
13417 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13418 goto onError;
13419 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013420 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013421 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013422 fillobj = zero;
13423 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013424 break;
13425
13426 case 'e':
13427 case 'E':
13428 case 'f':
13429 case 'F':
13430 case 'g':
13431 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013432 temp = formatfloat(v, flags, prec, c);
13433 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013434 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013435 if (PyUnicode_READY(temp) == -1) {
13436 Py_CLEAR(temp);
13437 goto onError;
13438 }
13439 pbuf = PyUnicode_DATA(temp);
13440 kind = PyUnicode_KIND(temp);
13441 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013442 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013443 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013444 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013445 fillobj = zero;
13446 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013447 break;
13448
13449 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013450 {
13451 Py_UCS4 ch = formatchar(v);
13452 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013453 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013454 temp = _PyUnicode_FromUCS4(&ch, 1);
13455 if (temp == NULL)
13456 goto onError;
13457 pbuf = PyUnicode_DATA(temp);
13458 kind = PyUnicode_KIND(temp);
13459 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013460 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013461 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013462
13463 default:
13464 PyErr_Format(PyExc_ValueError,
13465 "unsupported format character '%c' (0x%x) "
13466 "at index %zd",
13467 (31<=c && c<=126) ? (char)c : '?',
13468 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013469 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013470 goto onError;
13471 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013472 /* pbuf is initialized here. */
13473 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013474 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013475 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13476 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013477 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013478 pindex++;
13479 }
13480 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13481 signobj = plus;
13482 len--;
13483 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013484 }
13485 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013486 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013487 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013488 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013489 else
13490 sign = 0;
13491 }
13492 if (width < len)
13493 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013494 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013495 if (fill != ' ') {
13496 assert(signobj != NULL);
13497 if (_PyAccu_Accumulate(&acc, signobj))
13498 goto onError;
13499 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013500 if (width > len)
13501 width--;
13502 }
13503 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013504 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013505 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013506 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013507 second = get_latin1_char(
13508 PyUnicode_READ(kind, pbuf, pindex + 1));
13509 pindex += 2;
13510 if (second == NULL ||
13511 _PyAccu_Accumulate(&acc, zero) ||
13512 _PyAccu_Accumulate(&acc, second))
13513 goto onError;
13514 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013515 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013516 width -= 2;
13517 if (width < 0)
13518 width = 0;
13519 len -= 2;
13520 }
13521 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013522 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013523 if (repeat_accumulate(&acc, fillobj, width - len))
13524 goto onError;
13525 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013526 }
13527 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013528 if (sign) {
13529 assert(signobj != NULL);
13530 if (_PyAccu_Accumulate(&acc, signobj))
13531 goto onError;
13532 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013533 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013534 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13535 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013536 second = get_latin1_char(
13537 PyUnicode_READ(kind, pbuf, pindex + 1));
13538 pindex += 2;
13539 if (second == NULL ||
13540 _PyAccu_Accumulate(&acc, zero) ||
13541 _PyAccu_Accumulate(&acc, second))
13542 goto onError;
13543 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013544 }
13545 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013546 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013547 if (temp != NULL) {
13548 assert(pbuf == PyUnicode_DATA(temp));
13549 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013550 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013551 else {
13552 const char *p = (const char *) pbuf;
13553 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013554 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013555 v = PyUnicode_FromKindAndData(kind, p, len);
13556 }
13557 if (v == NULL)
13558 goto onError;
13559 r = _PyAccu_Accumulate(&acc, v);
13560 Py_DECREF(v);
13561 if (r)
13562 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013563 if (width > len && repeat_accumulate(&acc, blank, width - len))
13564 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013565 if (dict && (argidx < arglen) && c != '%') {
13566 PyErr_SetString(PyExc_TypeError,
13567 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013568 goto onError;
13569 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013570 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013571 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013572 } /* until end */
13573 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013574 PyErr_SetString(PyExc_TypeError,
13575 "not all arguments converted during string formatting");
13576 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013577 }
13578
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013579 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013580 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013581 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013582 }
13583 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013584 Py_XDECREF(temp);
13585 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013586 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013587
Benjamin Peterson29060642009-01-31 22:14:21 +000013588 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013589 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013590 Py_XDECREF(temp);
13591 Py_XDECREF(second);
13592 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013593 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013594 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013595 }
13596 return NULL;
13597}
13598
Jeremy Hylton938ace62002-07-17 16:30:39 +000013599static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013600unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13601
Tim Peters6d6c1a32001-08-02 04:15:00 +000013602static PyObject *
13603unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13604{
Benjamin Peterson29060642009-01-31 22:14:21 +000013605 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013606 static char *kwlist[] = {"object", "encoding", "errors", 0};
13607 char *encoding = NULL;
13608 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013609
Benjamin Peterson14339b62009-01-31 16:36:08 +000013610 if (type != &PyUnicode_Type)
13611 return unicode_subtype_new(type, args, kwds);
13612 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013613 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013614 return NULL;
13615 if (x == NULL)
Victor Stinner7931d9a2011-11-04 00:22:48 +010013616 return PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013617 if (encoding == NULL && errors == NULL)
13618 return PyObject_Str(x);
13619 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013620 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013621}
13622
Guido van Rossume023fe02001-08-30 03:12:59 +000013623static PyObject *
13624unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13625{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013626 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013627 Py_ssize_t length, char_size;
13628 int share_wstr, share_utf8;
13629 unsigned int kind;
13630 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013631
Benjamin Peterson14339b62009-01-31 16:36:08 +000013632 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013633
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013634 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013635 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013636 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013637 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013638 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013639 return NULL;
13640
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013641 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013642 if (self == NULL) {
13643 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013644 return NULL;
13645 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013646 kind = PyUnicode_KIND(unicode);
13647 length = PyUnicode_GET_LENGTH(unicode);
13648
13649 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013650#ifdef Py_DEBUG
13651 _PyUnicode_HASH(self) = -1;
13652#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013653 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013654#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013655 _PyUnicode_STATE(self).interned = 0;
13656 _PyUnicode_STATE(self).kind = kind;
13657 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013658 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013659 _PyUnicode_STATE(self).ready = 1;
13660 _PyUnicode_WSTR(self) = NULL;
13661 _PyUnicode_UTF8_LENGTH(self) = 0;
13662 _PyUnicode_UTF8(self) = NULL;
13663 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013664 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013665
13666 share_utf8 = 0;
13667 share_wstr = 0;
13668 if (kind == PyUnicode_1BYTE_KIND) {
13669 char_size = 1;
13670 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13671 share_utf8 = 1;
13672 }
13673 else if (kind == PyUnicode_2BYTE_KIND) {
13674 char_size = 2;
13675 if (sizeof(wchar_t) == 2)
13676 share_wstr = 1;
13677 }
13678 else {
13679 assert(kind == PyUnicode_4BYTE_KIND);
13680 char_size = 4;
13681 if (sizeof(wchar_t) == 4)
13682 share_wstr = 1;
13683 }
13684
13685 /* Ensure we won't overflow the length. */
13686 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13687 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013688 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013689 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013690 data = PyObject_MALLOC((length + 1) * char_size);
13691 if (data == NULL) {
13692 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013693 goto onError;
13694 }
13695
Victor Stinnerc3c74152011-10-02 20:39:55 +020013696 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013697 if (share_utf8) {
13698 _PyUnicode_UTF8_LENGTH(self) = length;
13699 _PyUnicode_UTF8(self) = data;
13700 }
13701 if (share_wstr) {
13702 _PyUnicode_WSTR_LENGTH(self) = length;
13703 _PyUnicode_WSTR(self) = (wchar_t *)data;
13704 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013705
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013706 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013707 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013708 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013709#ifdef Py_DEBUG
13710 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13711#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013712 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013713 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013714
13715onError:
13716 Py_DECREF(unicode);
13717 Py_DECREF(self);
13718 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013719}
13720
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013721PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013722 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013723\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013724Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013725encoding defaults to the current default string encoding.\n\
13726errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013727
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013728static PyObject *unicode_iter(PyObject *seq);
13729
Guido van Rossumd57fd912000-03-10 22:53:23 +000013730PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013731 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013732 "str", /* tp_name */
13733 sizeof(PyUnicodeObject), /* tp_size */
13734 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013735 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013736 (destructor)unicode_dealloc, /* tp_dealloc */
13737 0, /* tp_print */
13738 0, /* tp_getattr */
13739 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013740 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013741 unicode_repr, /* tp_repr */
13742 &unicode_as_number, /* tp_as_number */
13743 &unicode_as_sequence, /* tp_as_sequence */
13744 &unicode_as_mapping, /* tp_as_mapping */
13745 (hashfunc) unicode_hash, /* tp_hash*/
13746 0, /* tp_call*/
13747 (reprfunc) unicode_str, /* tp_str */
13748 PyObject_GenericGetAttr, /* tp_getattro */
13749 0, /* tp_setattro */
13750 0, /* tp_as_buffer */
13751 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013752 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013753 unicode_doc, /* tp_doc */
13754 0, /* tp_traverse */
13755 0, /* tp_clear */
13756 PyUnicode_RichCompare, /* tp_richcompare */
13757 0, /* tp_weaklistoffset */
13758 unicode_iter, /* tp_iter */
13759 0, /* tp_iternext */
13760 unicode_methods, /* tp_methods */
13761 0, /* tp_members */
13762 0, /* tp_getset */
13763 &PyBaseObject_Type, /* tp_base */
13764 0, /* tp_dict */
13765 0, /* tp_descr_get */
13766 0, /* tp_descr_set */
13767 0, /* tp_dictoffset */
13768 0, /* tp_init */
13769 0, /* tp_alloc */
13770 unicode_new, /* tp_new */
13771 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013772};
13773
13774/* Initialize the Unicode implementation */
13775
Victor Stinner3a50e702011-10-18 21:21:00 +020013776int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013777{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013778 int i;
13779
Thomas Wouters477c8d52006-05-27 19:21:47 +000013780 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013781 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013782 0x000A, /* LINE FEED */
13783 0x000D, /* CARRIAGE RETURN */
13784 0x001C, /* FILE SEPARATOR */
13785 0x001D, /* GROUP SEPARATOR */
13786 0x001E, /* RECORD SEPARATOR */
13787 0x0085, /* NEXT LINE */
13788 0x2028, /* LINE SEPARATOR */
13789 0x2029, /* PARAGRAPH SEPARATOR */
13790 };
13791
Fred Drakee4315f52000-05-09 19:53:39 +000013792 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013793 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013794 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013795 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010013796 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013797
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013798 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013799 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013800 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013801 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013802
13803 /* initialize the linebreak bloom filter */
13804 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013805 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013806 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013807
13808 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013809
13810#ifdef HAVE_MBCS
13811 winver.dwOSVersionInfoSize = sizeof(winver);
13812 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13813 PyErr_SetFromWindowsErr(0);
13814 return -1;
13815 }
13816#endif
13817 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013818}
13819
13820/* Finalize the Unicode implementation */
13821
/* Does no work in this implementation and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
13827
Guido van Rossumd57fd912000-03-10 22:53:23 +000013828void
Thomas Wouters78890102000-07-22 19:25:51 +000013829_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013830{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013831 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013832
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013833 Py_XDECREF(unicode_empty);
13834 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013835
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013836 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013837 if (unicode_latin1[i]) {
13838 Py_DECREF(unicode_latin1[i]);
13839 unicode_latin1[i] = NULL;
13840 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013841 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013842 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013843 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013844}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013845
Walter Dörwald16807132007-05-25 13:52:07 +000013846void
13847PyUnicode_InternInPlace(PyObject **p)
13848{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013849 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013850 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013851#ifdef Py_DEBUG
13852 assert(s != NULL);
13853 assert(_PyUnicode_CHECK(s));
13854#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013855 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013856 return;
13857#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013858 /* If it's a subclass, we don't really know what putting
13859 it in the interned dict might do. */
13860 if (!PyUnicode_CheckExact(s))
13861 return;
13862 if (PyUnicode_CHECK_INTERNED(s))
13863 return;
13864 if (interned == NULL) {
13865 interned = PyDict_New();
13866 if (interned == NULL) {
13867 PyErr_Clear(); /* Don't leave an exception */
13868 return;
13869 }
13870 }
13871 /* It might be that the GetItem call fails even
13872 though the key is present in the dictionary,
13873 namely when this happens during a stack overflow. */
13874 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013875 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013876 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013877
Benjamin Peterson29060642009-01-31 22:14:21 +000013878 if (t) {
13879 Py_INCREF(t);
13880 Py_DECREF(*p);
13881 *p = t;
13882 return;
13883 }
Walter Dörwald16807132007-05-25 13:52:07 +000013884
Benjamin Peterson14339b62009-01-31 16:36:08 +000013885 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013886 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013887 PyErr_Clear();
13888 PyThreadState_GET()->recursion_critical = 0;
13889 return;
13890 }
13891 PyThreadState_GET()->recursion_critical = 0;
13892 /* The two references in interned are not counted by refcnt.
13893 The deallocator will take care of this */
13894 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013895 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013896}
13897
13898void
13899PyUnicode_InternImmortal(PyObject **p)
13900{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013901 PyUnicode_InternInPlace(p);
13902 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020013903 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013904 Py_INCREF(*p);
13905 }
Walter Dörwald16807132007-05-25 13:52:07 +000013906}
13907
13908PyObject *
13909PyUnicode_InternFromString(const char *cp)
13910{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013911 PyObject *s = PyUnicode_FromString(cp);
13912 if (s == NULL)
13913 return NULL;
13914 PyUnicode_InternInPlace(&s);
13915 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013916}
13917
Alexander Belopolsky40018472011-02-26 01:02:56 +000013918void
13919_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013920{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013921 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013922 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013923 Py_ssize_t i, n;
13924 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013925
Benjamin Peterson14339b62009-01-31 16:36:08 +000013926 if (interned == NULL || !PyDict_Check(interned))
13927 return;
13928 keys = PyDict_Keys(interned);
13929 if (keys == NULL || !PyList_Check(keys)) {
13930 PyErr_Clear();
13931 return;
13932 }
Walter Dörwald16807132007-05-25 13:52:07 +000013933
Benjamin Peterson14339b62009-01-31 16:36:08 +000013934 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13935 detector, interned unicode strings are not forcibly deallocated;
13936 rather, we give them their stolen references back, and then clear
13937 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013938
Benjamin Peterson14339b62009-01-31 16:36:08 +000013939 n = PyList_GET_SIZE(keys);
13940 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013941 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013942 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013943 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013944 if (PyUnicode_READY(s) == -1) {
13945 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013946 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013947 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013948 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013949 case SSTATE_NOT_INTERNED:
13950 /* XXX Shouldn't happen */
13951 break;
13952 case SSTATE_INTERNED_IMMORTAL:
13953 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013954 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013955 break;
13956 case SSTATE_INTERNED_MORTAL:
13957 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013958 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013959 break;
13960 default:
13961 Py_FatalError("Inconsistent interned string state.");
13962 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013963 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013964 }
13965 fprintf(stderr, "total size of all interned strings: "
13966 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13967 "mortal/immortal\n", mortal_size, immortal_size);
13968 Py_DECREF(keys);
13969 PyDict_Clear(interned);
13970 Py_DECREF(interned);
13971 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013972}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013973
13974
13975/********************* Unicode Iterator **************************/
13976
13977typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013978 PyObject_HEAD
13979 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013980 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013981} unicodeiterobject;
13982
13983static void
13984unicodeiter_dealloc(unicodeiterobject *it)
13985{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013986 _PyObject_GC_UNTRACK(it);
13987 Py_XDECREF(it->it_seq);
13988 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013989}
13990
13991static int
13992unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13993{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013994 Py_VISIT(it->it_seq);
13995 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013996}
13997
13998static PyObject *
13999unicodeiter_next(unicodeiterobject *it)
14000{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014001 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014002
Benjamin Peterson14339b62009-01-31 16:36:08 +000014003 assert(it != NULL);
14004 seq = it->it_seq;
14005 if (seq == NULL)
14006 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014007 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014009 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14010 int kind = PyUnicode_KIND(seq);
14011 void *data = PyUnicode_DATA(seq);
14012 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14013 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014014 if (item != NULL)
14015 ++it->it_index;
14016 return item;
14017 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014018
Benjamin Peterson14339b62009-01-31 16:36:08 +000014019 Py_DECREF(seq);
14020 it->it_seq = NULL;
14021 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014022}
14023
14024static PyObject *
14025unicodeiter_len(unicodeiterobject *it)
14026{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014027 Py_ssize_t len = 0;
14028 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014029 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014030 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014031}
14032
14033PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14034
14035static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014036 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014037 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014038 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014039};
14040
14041PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014042 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14043 "str_iterator", /* tp_name */
14044 sizeof(unicodeiterobject), /* tp_basicsize */
14045 0, /* tp_itemsize */
14046 /* methods */
14047 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14048 0, /* tp_print */
14049 0, /* tp_getattr */
14050 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014051 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014052 0, /* tp_repr */
14053 0, /* tp_as_number */
14054 0, /* tp_as_sequence */
14055 0, /* tp_as_mapping */
14056 0, /* tp_hash */
14057 0, /* tp_call */
14058 0, /* tp_str */
14059 PyObject_GenericGetAttr, /* tp_getattro */
14060 0, /* tp_setattro */
14061 0, /* tp_as_buffer */
14062 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14063 0, /* tp_doc */
14064 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14065 0, /* tp_clear */
14066 0, /* tp_richcompare */
14067 0, /* tp_weaklistoffset */
14068 PyObject_SelfIter, /* tp_iter */
14069 (iternextfunc)unicodeiter_next, /* tp_iternext */
14070 unicodeiter_methods, /* tp_methods */
14071 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014072};
14073
14074static PyObject *
14075unicode_iter(PyObject *seq)
14076{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014077 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014078
Benjamin Peterson14339b62009-01-31 16:36:08 +000014079 if (!PyUnicode_Check(seq)) {
14080 PyErr_BadInternalCall();
14081 return NULL;
14082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014083 if (PyUnicode_READY(seq) == -1)
14084 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014085 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14086 if (it == NULL)
14087 return NULL;
14088 it->it_index = 0;
14089 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014090 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014091 _PyObject_GC_TRACK(it);
14092 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014093}
14094
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014095
14096size_t
14097Py_UNICODE_strlen(const Py_UNICODE *u)
14098{
14099 int res = 0;
14100 while(*u++)
14101 res++;
14102 return res;
14103}
14104
14105Py_UNICODE*
14106Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14107{
14108 Py_UNICODE *u = s1;
14109 while ((*u++ = *s2++));
14110 return s1;
14111}
14112
14113Py_UNICODE*
14114Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14115{
14116 Py_UNICODE *u = s1;
14117 while ((*u++ = *s2++))
14118 if (n-- == 0)
14119 break;
14120 return s1;
14121}
14122
14123Py_UNICODE*
14124Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14125{
14126 Py_UNICODE *u1 = s1;
14127 u1 += Py_UNICODE_strlen(u1);
14128 Py_UNICODE_strcpy(u1, s2);
14129 return s1;
14130}
14131
14132int
14133Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14134{
14135 while (*s1 && *s2 && *s1 == *s2)
14136 s1++, s2++;
14137 if (*s1 && *s2)
14138 return (*s1 < *s2) ? -1 : +1;
14139 if (*s1)
14140 return 1;
14141 if (*s2)
14142 return -1;
14143 return 0;
14144}
14145
14146int
14147Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14148{
14149 register Py_UNICODE u1, u2;
14150 for (; n != 0; n--) {
14151 u1 = *s1;
14152 u2 = *s2;
14153 if (u1 != u2)
14154 return (u1 < u2) ? -1 : +1;
14155 if (u1 == '\0')
14156 return 0;
14157 s1++;
14158 s2++;
14159 }
14160 return 0;
14161}
14162
14163Py_UNICODE*
14164Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14165{
14166 const Py_UNICODE *p;
14167 for (p = s; *p; p++)
14168 if (*p == c)
14169 return (Py_UNICODE*)p;
14170 return NULL;
14171}
14172
14173Py_UNICODE*
14174Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14175{
14176 const Py_UNICODE *p;
14177 p = s + Py_UNICODE_strlen(s);
14178 while (p != s) {
14179 p--;
14180 if (*p == c)
14181 return (Py_UNICODE*)p;
14182 }
14183 return NULL;
14184}
Victor Stinner331ea922010-08-10 16:37:20 +000014185
Victor Stinner71133ff2010-09-01 23:43:53 +000014186Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014187PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014188{
Victor Stinner577db2c2011-10-11 22:12:48 +020014189 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014190 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014191
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014192 if (!PyUnicode_Check(unicode)) {
14193 PyErr_BadArgument();
14194 return NULL;
14195 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014196 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014197 if (u == NULL)
14198 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014199 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014200 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014201 PyErr_NoMemory();
14202 return NULL;
14203 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014204 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014205 size *= sizeof(Py_UNICODE);
14206 copy = PyMem_Malloc(size);
14207 if (copy == NULL) {
14208 PyErr_NoMemory();
14209 return NULL;
14210 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014211 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014212 return copy;
14213}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014214
Georg Brandl66c221e2010-10-14 07:04:07 +000014215/* A _string module, to export formatter_parser and formatter_field_name_split
14216 to the string.Formatter class implemented in Python. */
14217
14218static PyMethodDef _string_methods[] = {
14219 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14220 METH_O, PyDoc_STR("split the argument as a field name")},
14221 {"formatter_parser", (PyCFunction) formatter_parser,
14222 METH_O, PyDoc_STR("parse the argument as a format string")},
14223 {NULL, NULL}
14224};
14225
14226static struct PyModuleDef _string_module = {
14227 PyModuleDef_HEAD_INIT,
14228 "_string",
14229 PyDoc_STR("string helper module"),
14230 0,
14231 _string_methods,
14232 NULL,
14233 NULL,
14234 NULL,
14235 NULL
14236};
14237
14238PyMODINIT_FUNC
14239PyInit__string(void)
14240{
14241 return PyModule_Create(&_string_module);
14242}
14243
14244
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014245#ifdef __cplusplus
14246}
14247#endif