blob: 7444c8b4ba0c120b8d9024a04cfaeb3298f689e7 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Endianness switches; defaults to little endian */
50
#if defined(WORDS_BIGENDIAN)
/* the build configuration reported a big-endian target */
# define BYTEORDER_IS_BIG_ENDIAN
#else
/* no WORDS_BIGENDIAN: assume little endian */
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
56
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000057/* --- Globals ------------------------------------------------------------
58
59 The globals are initialized by the _PyUnicode_Init() API and should
60 not be used before calling that API.
61
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Highest code point of Unicode 6.0: U+10FFFF (decimal 1,114,111) */
#define MAX_UNICODE 0x10FFFF
71
/* _PyUnicode_CHECK(op): sanity check used inside assert() by the macros of
   this file.  Debug builds run the structural consistency check (with
   check_content == 0, i.e. without scanning the characters); other builds
   only test the object type. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* Field accessors for the string structures.  The macros whose body is a
   plain member access perform no checks and may be used as lvalues; the
   others assert their preconditions first. */

/* utf8 member of the PyCompactUnicodeObject header (unchecked) */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* UTF-8 buffer of a ready string: for compact ASCII strings this is the
   memory located right after the PyASCIIObject header, otherwise the utf8
   member */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     (PyUnicode_IS_COMPACT_ASCII(op)                        \
          ? ((char *)((PyASCIIObject *)(op) + 1))           \
          : _PyUnicode_UTF8(op)))

/* utf8_length member of the PyCompactUnicodeObject header (unchecked) */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* Length of the UTF-8 buffer of a ready string: the string length for
   compact ASCII strings, otherwise the utf8_length member */
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     (PyUnicode_IS_COMPACT_ASCII(op)                        \
          ? ((PyASCIIObject *)(op))->length                 \
          : _PyUnicode_UTF8_LENGTH(op)))

/* wstr member of the PyASCIIObject header (unchecked) */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject *)(op))->wstr)

/* wstr_length member of the PyCompactUnicodeObject header (unchecked) */
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)

/* length, state and hash members of the PyASCIIObject header (unchecked) */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* state.kind and length, after asserting _PyUnicode_CHECK(op) */
#define _PyUnicode_KIND(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->length)

/* data.any member of the PyUnicodeObject structure (unchecked) */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* Replace any earlier definition of PyUnicode_READY() for the rest of this
   file: evaluates to 0 when the string is already ready, otherwise to the
   result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True when the utf8 pointer is the character data pointer itself, i.e. the
   UTF-8 form is not stored in a buffer of its own.  Must not be applied to
   compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),               \
     (PyUnicode_DATA(op) == _PyUnicode_UTF8(op)))
/* True when the wstr pointer is the character data pointer itself, i.e. the
   wchar_t form is not stored in a buffer of its own.

   The macro only refers to its own parameter `op`, so it works whatever the
   caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* Non-zero when the Unicode object has a UTF-8 memory block of its own:
   the string is not compact ASCII, the utf8 pointer is set, and it is not
   the character data pointer (not shared with other data). */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (!PyUnicode_IS_COMPACT_ASCII(op)                       \
      && _PyUnicode_UTF8(op) != NULL                        \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
135
/* Non-zero when the Unicode object has a wstr memory block of its own:
   the wstr pointer is set and either the string is not ready yet or wstr
   is not the character data pointer (not shared with other data). */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                      \
    (assert(_PyUnicode_CHECK(op)),                          \
     (_PyUnicode_WSTR(op) != NULL                           \
      && (!PyUnicode_IS_READY(op)                           \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
143
/* Generic helper macro to convert characters of different types.

   from_type, to_type: valid type names of the source and target characters.
   begin, end:         pointers of type "from_type *" delimiting the source
                       characters (end is one past the last character).
   to:                 pointer to the buffer receiving the converted
                       characters; it is cast to "to_type *".

   Each source character is converted with a plain cast.  The main loop is
   unrolled four characters at a time; the second loop copies the remaining
   0-3 characters.

   Every argument is parenthesized before use, so expressions such as
   "buffer + offset" can be passed for `to`, and all internal variables are
   prefixed with an underscore to stay clear of the caller's identifiers. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~(Py_ssize_t) 3);             \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200167
Walter Dörwald16807132007-05-25 13:52:07 +0000168/* This dictionary holds all interned unicode strings. Note that references
169 to strings in this dictionary are *not* counted in the string's ob_refcnt.
170 When the interned string reaches a refcnt of 0 the string deallocation
171 function will delete the reference from this dictionary.
172
173 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000174 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000175*/
176static PyObject *interned;
177
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000178/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200179static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200181/* List of static strings. */
182static _Py_Identifier *static_strings;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* Single character Unicode strings in the Latin-1 range are being
185 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200186static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, 1 for whitespace and 0 otherwise.

   Entries set to 1:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
218
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200219/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200220static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200221static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200222static void copy_characters(
223 PyObject *to, Py_ssize_t to_start,
224 PyObject *from, Py_ssize_t from_start,
225 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100226static int unicode_modifiable(PyObject *unicode);
227
Victor Stinnerfe226c02011-10-03 03:52:20 +0200228
Alexander Belopolsky40018472011-02-26 01:02:56 +0000229static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200230unicode_fromascii(const unsigned char *s, Py_ssize_t size);
231static PyObject *
232_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
235static PyObject *
236_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
237
238static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000240 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100241 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000242 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
243
Alexander Belopolsky40018472011-02-26 01:02:56 +0000244static void
245raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300246 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100247 PyObject *unicode,
248 Py_ssize_t startpos, Py_ssize_t endpos,
249 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000250
/* Fast detection of ASCII line breaks: a 128-entry table indexed by ASCII
   code, 1 for a line break and 0 otherwise.  The table is only ever read,
   hence const.

   Entries set to 1:
     0x0A LINE FEED          0x0B LINE TABULATION
     0x0C FORM FEED          0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR     0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
*/
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
278
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300279/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
280 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000282PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000283{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000284#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000286#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000287 /* This is actually an illegal character, so it should
288 not be passed to unichr. */
289 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000290#endif
291}
292
#ifdef Py_DEBUG
/* Debug-build validator for the internal state of a unicode object.

   op:            object to inspect; must satisfy PyUnicode_Check().
   check_content: when non-zero (and the string is not of the wchar kind),
                  the characters are scanned to verify that the narrowest
                  possible kind is used for the largest character present.

   Every violated invariant trips an assert(); when all checks pass the
   function returns 1, so it can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *hdr;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    hdr = (PyASCIIObject *)op;
    kind = hdr->state.kind;

    if (hdr->state.ascii == 1 && hdr->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(hdr->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *cobj = (PyCompactUnicodeObject *)op;
        void *chars;

        if (hdr->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            chars = cobj + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(hdr->state.ascii == 0);
            assert(hdr->state.ready == 1);
            assert(cobj->utf8 != chars);
        }
        else {
            /* non-compact: characters are referenced through data.any */
            PyUnicodeObject *full = (PyUnicodeObject *)op;

            chars = full->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer is set */
                assert(hdr->length == 0);
                assert(hdr->hash == -1);
                assert(hdr->state.compact == 0);
                assert(hdr->state.ascii == 0);
                assert(hdr->state.ready == 0);
                assert(hdr->state.interned == SSTATE_NOT_INTERNED);
                assert(hdr->wstr != NULL);
                assert(chars == NULL);
                assert(cobj->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(hdr->state.compact == 0);
                assert(hdr->state.ready == 1);
                assert(chars != NULL);
                if (hdr->state.ascii) {
                    /* ASCII data doubles as its own UTF-8 form */
                    assert(cobj->utf8 == chars);
                    assert(cobj->utf8_length == hdr->length);
                }
                else {
                    assert(cobj->utf8 != chars);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
            const unsigned int native_kind = PyUnicode_2BYTE_KIND;
#else
            const unsigned int native_kind = PyUnicode_4BYTE_KIND;
#endif
            if (kind == native_kind) {
                /* wstr must be shared with the character data */
                assert(hdr->wstr == chars);
                assert(cobj->wstr_length == hdr->length);
            }
            else {
                assert(hdr->wstr != chars);
            }
        }

        if (cobj->utf8 == NULL) {
            assert(cobj->utf8_length == 0);
        }
        if (hdr->wstr == NULL) {
            assert(cobj->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 highest = 0;
        void *chars = PyUnicode_DATA(hdr);

        for (pos = 0; pos < hdr->length; pos++) {
            Py_UCS4 cur = PyUnicode_READ(kind, chars, pos);
            if (cur > highest) {
                highest = cur;
            }
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (hdr->state.ascii == 0) {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            else {
                assert(highest < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
            break;
        default:
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
            break;
        }
    }
    return 1;
}
#endif
404
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100405static PyObject*
406unicode_result_wchar(PyObject *unicode)
407{
408#ifndef Py_DEBUG
409 Py_ssize_t len;
410
411 assert(Py_REFCNT(unicode) == 1);
412
413 len = _PyUnicode_WSTR_LENGTH(unicode);
414 if (len == 0) {
415 Py_INCREF(unicode_empty);
416 Py_DECREF(unicode);
417 return unicode_empty;
418 }
419
420 if (len == 1) {
421 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
422 if (ch < 256) {
423 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
424 Py_DECREF(unicode);
425 return latin1_char;
426 }
427 }
428
429 if (_PyUnicode_Ready(unicode) < 0) {
430 Py_XDECREF(unicode);
431 return NULL;
432 }
433#else
434 /* don't make the result ready in debug mode to ensure that the caller
435 makes the string ready before using it */
436 assert(_PyUnicode_CheckConsistency(unicode, 1));
437#endif
438 return unicode;
439}
440
441static PyObject*
442unicode_result_ready(PyObject *unicode)
443{
444 Py_ssize_t length;
445
446 length = PyUnicode_GET_LENGTH(unicode);
447 if (length == 0) {
448 if (unicode != unicode_empty) {
449 Py_INCREF(unicode_empty);
450 Py_DECREF(unicode);
451 }
452 return unicode_empty;
453 }
454
455 if (length == 1) {
456 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
457 if (ch < 256) {
458 PyObject *latin1_char = unicode_latin1[ch];
459 if (latin1_char != NULL) {
460 if (unicode != latin1_char) {
461 Py_INCREF(latin1_char);
462 Py_DECREF(unicode);
463 }
464 return latin1_char;
465 }
466 else {
467 assert(_PyUnicode_CheckConsistency(unicode, 1));
468 Py_INCREF(unicode);
469 unicode_latin1[ch] = unicode;
470 return unicode;
471 }
472 }
473 }
474
475 assert(_PyUnicode_CheckConsistency(unicode, 1));
476 return unicode;
477}
478
479static PyObject*
480unicode_result(PyObject *unicode)
481{
482 assert(_PyUnicode_CHECK(unicode));
483 if (PyUnicode_IS_READY(unicode))
484 return unicode_result_ready(unicode);
485 else
486 return unicode_result_wchar(unicode);
487}
488
Victor Stinnerc4b49542011-12-11 22:44:26 +0100489static PyObject*
490unicode_result_unchanged(PyObject *unicode)
491{
492 if (PyUnicode_CheckExact(unicode)) {
493 if (PyUnicode_READY(unicode) < 0)
494 return NULL;
495 Py_INCREF(unicode);
496 return unicode;
497 }
498 else
499 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100500 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100501}
502
#if defined(HAVE_MBCS)
/* OS version information; only declared in builds with HAVE_MBCS */
static OSVERSIONINFOEX winver;
#endif
506
Thomas Wouters477c8d52006-05-27 19:21:47 +0000507/* --- Bloom Filters ----------------------------------------------------- */
508
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each unicode character as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */
514
Antoine Pitrouf068f942010-01-13 14:19:12 +0000515#if LONG_BIT >= 128
516#define BLOOM_WIDTH 128
517#elif LONG_BIT >= 64
518#define BLOOM_WIDTH 64
519#elif LONG_BIT >= 32
520#define BLOOM_WIDTH 32
521#else
522#error "LONG_BIT is smaller than 32"
523#endif
524
/* Integer type holding one bloom bitmask */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK() for non-ASCII characters */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit selected by the low bits of ch in the
   lvalue `mask`.
   BLOOM(mask, ch): non-zero when the bit selected by ch is set in `mask`;
   zero means ch was definitely never added, non-zero means "maybe".

   Both arguments are parenthesized so that any expression can be passed,
   e.g. BLOOM(a | b, ch). */
#define BLOOM_ADD(mask, ch) \
    ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch) \
    ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000531
/* Line break test: characters below 128 are looked up in the
   ascii_linebreak table; for the others the bloom_linebreak mask is tested
   first and Py_UNICODE_ISLINEBREAK() is only called when the mask matches. */
#define BLOOM_LINEBREAK(ch)                                 \
    ((ch) < 128U                                            \
         ? ascii_linebreak[(ch)]                            \
         : (BLOOM(bloom_linebreak, (ch))                    \
            && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000535
Alexander Belopolsky40018472011-02-26 01:02:56 +0000536Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200537make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000538{
539 /* calculate simple bloom-style bitmask for a given unicode string */
540
Antoine Pitrouf068f942010-01-13 14:19:12 +0000541 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542 Py_ssize_t i;
543
544 mask = 0;
545 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200546 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
548 return mask;
549}
550
/* Membership test for chr in the string str: the bloom mask is checked
   first, and PyUnicode_FindChar() over the whole string is only called
   when the mask does not rule the character out. */
#define BLOOM_MEMBER(mask, chr, str)                        \
    (BLOOM(mask, chr)                                       \
     && (PyUnicode_FindChar(str, chr, 0,                    \
                            PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200555/* Compilation of templated routines */
556
557#include "stringlib/asciilib.h"
558#include "stringlib/fastsearch.h"
559#include "stringlib/partition.h"
560#include "stringlib/split.h"
561#include "stringlib/count.h"
562#include "stringlib/find.h"
563#include "stringlib/find_max_char.h"
564#include "stringlib/localeutil.h"
565#include "stringlib/undef.h"
566
567#include "stringlib/ucs1lib.h"
568#include "stringlib/fastsearch.h"
569#include "stringlib/partition.h"
570#include "stringlib/split.h"
571#include "stringlib/count.h"
572#include "stringlib/find.h"
573#include "stringlib/find_max_char.h"
574#include "stringlib/localeutil.h"
575#include "stringlib/undef.h"
576
577#include "stringlib/ucs2lib.h"
578#include "stringlib/fastsearch.h"
579#include "stringlib/partition.h"
580#include "stringlib/split.h"
581#include "stringlib/count.h"
582#include "stringlib/find.h"
583#include "stringlib/find_max_char.h"
584#include "stringlib/localeutil.h"
585#include "stringlib/undef.h"
586
587#include "stringlib/ucs4lib.h"
588#include "stringlib/fastsearch.h"
589#include "stringlib/partition.h"
590#include "stringlib/split.h"
591#include "stringlib/count.h"
592#include "stringlib/find.h"
593#include "stringlib/find_max_char.h"
594#include "stringlib/localeutil.h"
595#include "stringlib/undef.h"
596
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200597#include "stringlib/unicodedefs.h"
598#include "stringlib/fastsearch.h"
599#include "stringlib/count.h"
600#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100601#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200602
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603/* --- Unicode Object ----------------------------------------------------- */
604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200605static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200606fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200607
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200608Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
609 Py_ssize_t size, Py_UCS4 ch,
610 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
613
614 switch (kind) {
615 case PyUnicode_1BYTE_KIND:
616 {
617 Py_UCS1 ch1 = (Py_UCS1) ch;
618 if (ch1 == ch)
619 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
620 else
621 return -1;
622 }
623 case PyUnicode_2BYTE_KIND:
624 {
625 Py_UCS2 ch2 = (Py_UCS2) ch;
626 if (ch2 == ch)
627 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
628 else
629 return -1;
630 }
631 case PyUnicode_4BYTE_KIND:
632 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
633 default:
634 assert(0);
635 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200637}
638
Victor Stinnerfe226c02011-10-03 03:52:20 +0200639static PyObject*
640resize_compact(PyObject *unicode, Py_ssize_t length)
641{
642 Py_ssize_t char_size;
643 Py_ssize_t struct_size;
644 Py_ssize_t new_size;
645 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100646 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200647 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100648 assert(PyUnicode_IS_COMPACT(unicode));
649
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200650 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100651 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200652 struct_size = sizeof(PyASCIIObject);
653 else
654 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200655 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200656
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
658 PyErr_NoMemory();
659 return NULL;
660 }
661 new_size = (struct_size + (length + 1) * char_size);
662
Victor Stinner84def372011-12-11 20:04:56 +0100663 _Py_DEC_REFTOTAL;
664 _Py_ForgetReference(unicode);
665
666 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
667 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100668 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200669 PyErr_NoMemory();
670 return NULL;
671 }
Victor Stinner84def372011-12-11 20:04:56 +0100672 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200673 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100674
Victor Stinnerfe226c02011-10-03 03:52:20 +0200675 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200676 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200677 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100678 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200679 _PyUnicode_WSTR_LENGTH(unicode) = length;
680 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
682 length, 0);
683 return unicode;
684}
685
Alexander Belopolsky40018472011-02-26 01:02:56 +0000686static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200687resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688{
Victor Stinner95663112011-10-04 01:03:50 +0200689 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100690 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200691 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200692 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000693
Victor Stinnerfe226c02011-10-03 03:52:20 +0200694 if (PyUnicode_IS_READY(unicode)) {
695 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200696 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 void *data;
698
699 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200700 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200701 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
702 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200703
704 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
705 PyErr_NoMemory();
706 return -1;
707 }
708 new_size = (length + 1) * char_size;
709
Victor Stinner7a9105a2011-12-12 00:13:42 +0100710 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
711 {
712 PyObject_DEL(_PyUnicode_UTF8(unicode));
713 _PyUnicode_UTF8(unicode) = NULL;
714 _PyUnicode_UTF8_LENGTH(unicode) = 0;
715 }
716
Victor Stinnerfe226c02011-10-03 03:52:20 +0200717 data = (PyObject *)PyObject_REALLOC(data, new_size);
718 if (data == NULL) {
719 PyErr_NoMemory();
720 return -1;
721 }
722 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200723 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200724 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200725 _PyUnicode_WSTR_LENGTH(unicode) = length;
726 }
727 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200728 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200729 _PyUnicode_UTF8_LENGTH(unicode) = length;
730 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200731 _PyUnicode_LENGTH(unicode) = length;
732 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200733 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200734 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200735 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 }
Victor Stinner95663112011-10-04 01:03:50 +0200738 assert(_PyUnicode_WSTR(unicode) != NULL);
739
740 /* check for integer overflow */
741 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
742 PyErr_NoMemory();
743 return -1;
744 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100745 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200746 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100747 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200748 if (!wstr) {
749 PyErr_NoMemory();
750 return -1;
751 }
752 _PyUnicode_WSTR(unicode) = wstr;
753 _PyUnicode_WSTR(unicode)[length] = 0;
754 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200755 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000756 return 0;
757}
758
Victor Stinnerfe226c02011-10-03 03:52:20 +0200759static PyObject*
760resize_copy(PyObject *unicode, Py_ssize_t length)
761{
762 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100763 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100765
766 if (PyUnicode_READY(unicode) < 0)
767 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200768
769 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
770 if (copy == NULL)
771 return NULL;
772
773 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200774 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200775 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200776 }
777 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200778 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100779
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200780 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200781 if (w == NULL)
782 return NULL;
783 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
784 copy_length = Py_MIN(copy_length, length);
785 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
786 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200787 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200788 }
789}
790
Guido van Rossumd57fd912000-03-10 22:53:23 +0000791/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000792 Ux0000 terminated; some code (e.g. new_identifier)
793 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000794
795 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000796 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000797
798*/
799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200800#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200801static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200802#endif
803
Alexander Belopolsky40018472011-02-26 01:02:56 +0000804static PyUnicodeObject *
805_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000806{
807 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200808 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000809
Thomas Wouters477c8d52006-05-27 19:21:47 +0000810 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811 if (length == 0 && unicode_empty != NULL) {
812 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200813 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000814 }
815
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000816 /* Ensure we won't overflow the size. */
817 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
818 return (PyUnicodeObject *)PyErr_NoMemory();
819 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200820 if (length < 0) {
821 PyErr_SetString(PyExc_SystemError,
822 "Negative size passed to _PyUnicode_New");
823 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824 }
825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826#ifdef Py_DEBUG
827 ++unicode_old_new_calls;
828#endif
829
830 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
831 if (unicode == NULL)
832 return NULL;
833 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
834 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
835 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100836 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000837 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100838 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200840
Jeremy Hyltond8082792003-09-16 19:41:39 +0000841 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000842 * the caller fails before initializing str -- unicode_resize()
843 * reads str[0], and the Keep-Alive optimization can keep memory
844 * allocated for str alive across a call to unicode_dealloc(unicode).
845 * We don't want unicode_resize to read uninitialized memory in
846 * that case.
847 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200848 _PyUnicode_WSTR(unicode)[0] = 0;
849 _PyUnicode_WSTR(unicode)[length] = 0;
850 _PyUnicode_WSTR_LENGTH(unicode) = length;
851 _PyUnicode_HASH(unicode) = -1;
852 _PyUnicode_STATE(unicode).interned = 0;
853 _PyUnicode_STATE(unicode).kind = 0;
854 _PyUnicode_STATE(unicode).compact = 0;
855 _PyUnicode_STATE(unicode).ready = 0;
856 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200857 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200859 _PyUnicode_UTF8(unicode) = NULL;
860 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100861 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000862 return unicode;
863}
864
Victor Stinnerf42dc442011-10-02 23:33:16 +0200865static const char*
866unicode_kind_name(PyObject *unicode)
867{
Victor Stinner42dfd712011-10-03 14:41:45 +0200868 /* don't check consistency: unicode_kind_name() is called from
869 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200870 if (!PyUnicode_IS_COMPACT(unicode))
871 {
872 if (!PyUnicode_IS_READY(unicode))
873 return "wstr";
874 switch(PyUnicode_KIND(unicode))
875 {
876 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200877 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200878 return "legacy ascii";
879 else
880 return "legacy latin1";
881 case PyUnicode_2BYTE_KIND:
882 return "legacy UCS2";
883 case PyUnicode_4BYTE_KIND:
884 return "legacy UCS4";
885 default:
886 return "<legacy invalid kind>";
887 }
888 }
889 assert(PyUnicode_IS_READY(unicode));
890 switch(PyUnicode_KIND(unicode))
891 {
892 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 return "ascii";
895 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200896 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200898 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200899 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 default:
902 return "<invalid compact kind>";
903 }
904}
905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200906#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200907static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200908
909/* Functions wrapping macros for use in debugger */
910char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200911 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912}
913
914void *_PyUnicode_compact_data(void *unicode) {
915 return _PyUnicode_COMPACT_DATA(unicode);
916}
917void *_PyUnicode_data(void *unicode){
918 printf("obj %p\n", unicode);
919 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
920 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
921 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
922 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
923 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
924 return PyUnicode_DATA(unicode);
925}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200926
927void
928_PyUnicode_Dump(PyObject *op)
929{
930 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200931 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
932 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
933 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200934
Victor Stinnera849a4b2011-10-03 12:12:11 +0200935 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200936 {
937 if (ascii->state.ascii)
938 data = (ascii + 1);
939 else
940 data = (compact + 1);
941 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200942 else
943 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200944 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
945
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 if (ascii->wstr == data)
947 printf("shared ");
948 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200949
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200951 printf(" (%zu), ", compact->wstr_length);
952 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
953 printf("shared ");
954 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200955 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200956 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200957}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958#endif
959
960PyObject *
961PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
962{
963 PyObject *obj;
964 PyCompactUnicodeObject *unicode;
965 void *data;
966 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200967 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 Py_ssize_t char_size;
969 Py_ssize_t struct_size;
970
971 /* Optimization for empty strings */
972 if (size == 0 && unicode_empty != NULL) {
973 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200974 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 }
976
977#ifdef Py_DEBUG
978 ++unicode_new_new_calls;
979#endif
980
Victor Stinner9e9d6892011-10-04 01:02:02 +0200981 is_ascii = 0;
982 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 struct_size = sizeof(PyCompactUnicodeObject);
984 if (maxchar < 128) {
985 kind_state = PyUnicode_1BYTE_KIND;
986 char_size = 1;
987 is_ascii = 1;
988 struct_size = sizeof(PyASCIIObject);
989 }
990 else if (maxchar < 256) {
991 kind_state = PyUnicode_1BYTE_KIND;
992 char_size = 1;
993 }
994 else if (maxchar < 65536) {
995 kind_state = PyUnicode_2BYTE_KIND;
996 char_size = 2;
997 if (sizeof(wchar_t) == 2)
998 is_sharing = 1;
999 }
1000 else {
1001 kind_state = PyUnicode_4BYTE_KIND;
1002 char_size = 4;
1003 if (sizeof(wchar_t) == 4)
1004 is_sharing = 1;
1005 }
1006
1007 /* Ensure we won't overflow the size. */
1008 if (size < 0) {
1009 PyErr_SetString(PyExc_SystemError,
1010 "Negative size passed to PyUnicode_New");
1011 return NULL;
1012 }
1013 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1014 return PyErr_NoMemory();
1015
1016 /* Duplicated allocation code from _PyObject_New() instead of a call to
1017 * PyObject_New() so we are able to allocate space for the object and
1018 * it's data buffer.
1019 */
1020 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1021 if (obj == NULL)
1022 return PyErr_NoMemory();
1023 obj = PyObject_INIT(obj, &PyUnicode_Type);
1024 if (obj == NULL)
1025 return NULL;
1026
1027 unicode = (PyCompactUnicodeObject *)obj;
1028 if (is_ascii)
1029 data = ((PyASCIIObject*)obj) + 1;
1030 else
1031 data = unicode + 1;
1032 _PyUnicode_LENGTH(unicode) = size;
1033 _PyUnicode_HASH(unicode) = -1;
1034 _PyUnicode_STATE(unicode).interned = 0;
1035 _PyUnicode_STATE(unicode).kind = kind_state;
1036 _PyUnicode_STATE(unicode).compact = 1;
1037 _PyUnicode_STATE(unicode).ready = 1;
1038 _PyUnicode_STATE(unicode).ascii = is_ascii;
1039 if (is_ascii) {
1040 ((char*)data)[size] = 0;
1041 _PyUnicode_WSTR(unicode) = NULL;
1042 }
1043 else if (kind_state == PyUnicode_1BYTE_KIND) {
1044 ((char*)data)[size] = 0;
1045 _PyUnicode_WSTR(unicode) = NULL;
1046 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001048 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 }
1050 else {
1051 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001052 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 if (kind_state == PyUnicode_2BYTE_KIND)
1054 ((Py_UCS2*)data)[size] = 0;
1055 else /* kind_state == PyUnicode_4BYTE_KIND */
1056 ((Py_UCS4*)data)[size] = 0;
1057 if (is_sharing) {
1058 _PyUnicode_WSTR_LENGTH(unicode) = size;
1059 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1060 }
1061 else {
1062 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1063 _PyUnicode_WSTR(unicode) = NULL;
1064 }
1065 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001066 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 return obj;
1068}
1069
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing into the
   4-byte data buffer of `unicode`.  A high surrogate immediately followed
   by a low surrogate is joined into one code point; every other unit is
   copied as is.  The other conversions are implemented as macros for
   efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* a well-formed pair yields a single code point */
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst = *src;
            src += 1;
        }
        dst++;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1109
Victor Stinnercd9950f2011-10-02 00:34:53 +02001110static int
Victor Stinner488fa492011-12-12 00:01:39 +01001111unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001112{
Victor Stinner488fa492011-12-12 00:01:39 +01001113 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001114 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001115 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001116 return -1;
1117 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001118 return 0;
1119}
1120
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001121static int
1122_copy_characters(PyObject *to, Py_ssize_t to_start,
1123 PyObject *from, Py_ssize_t from_start,
1124 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001126 unsigned int from_kind, to_kind;
1127 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001128 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130 assert(PyUnicode_Check(from));
1131 assert(PyUnicode_Check(to));
1132 assert(PyUnicode_IS_READY(from));
1133 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1136 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1137 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001139 if (how_many == 0)
1140 return 0;
1141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001143 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001145 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147#ifdef Py_DEBUG
1148 if (!check_maxchar
1149 && (from_kind > to_kind
1150 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1153 Py_UCS4 ch;
1154 Py_ssize_t i;
1155 for (i=0; i < how_many; i++) {
1156 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1157 assert(ch <= to_maxchar);
1158 }
1159 }
1160#endif
1161 fast = (from_kind == to_kind);
1162 if (check_maxchar
1163 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1164 {
1165 /* deny latin1 => ascii */
1166 fast = 0;
1167 }
1168
1169 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001170 Py_MEMCPY((char*)to_data + to_kind * to_start,
1171 (char*)from_data + from_kind * from_start,
1172 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001174 else if (from_kind == PyUnicode_1BYTE_KIND
1175 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001176 {
1177 _PyUnicode_CONVERT_BYTES(
1178 Py_UCS1, Py_UCS2,
1179 PyUnicode_1BYTE_DATA(from) + from_start,
1180 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1181 PyUnicode_2BYTE_DATA(to) + to_start
1182 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001183 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001184 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001185 && to_kind == PyUnicode_4BYTE_KIND)
1186 {
1187 _PyUnicode_CONVERT_BYTES(
1188 Py_UCS1, Py_UCS4,
1189 PyUnicode_1BYTE_DATA(from) + from_start,
1190 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1191 PyUnicode_4BYTE_DATA(to) + to_start
1192 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001193 }
1194 else if (from_kind == PyUnicode_2BYTE_KIND
1195 && to_kind == PyUnicode_4BYTE_KIND)
1196 {
1197 _PyUnicode_CONVERT_BYTES(
1198 Py_UCS2, Py_UCS4,
1199 PyUnicode_2BYTE_DATA(from) + from_start,
1200 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1201 PyUnicode_4BYTE_DATA(to) + to_start
1202 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001203 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001204 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001205 /* check if max_char(from substring) <= max_char(to) */
1206 if (from_kind > to_kind
1207 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001208 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001209 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001210 /* slow path to check for character overflow */
1211 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001212 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 Py_ssize_t i;
1214
Victor Stinner56c161a2011-10-06 02:47:11 +02001215#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001216 for (i=0; i < how_many; i++) {
1217 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001218 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001219 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1220 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001221#else
1222 if (!check_maxchar) {
1223 for (i=0; i < how_many; i++) {
1224 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1225 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1226 }
1227 }
1228 else {
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 if (ch > to_maxchar)
1232 return 1;
1233 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1234 }
1235 }
1236#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001237 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001238 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001239 assert(0 && "inconsistent state");
1240 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001241 }
1242 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001243 return 0;
1244}
1245
1246static void
1247copy_characters(PyObject *to, Py_ssize_t to_start,
1248 PyObject *from, Py_ssize_t from_start,
1249 Py_ssize_t how_many)
1250{
1251 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1252}
1253
1254Py_ssize_t
1255PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1256 PyObject *from, Py_ssize_t from_start,
1257 Py_ssize_t how_many)
1258{
1259 int err;
1260
1261 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1262 PyErr_BadInternalCall();
1263 return -1;
1264 }
1265
1266 if (PyUnicode_READY(from))
1267 return -1;
1268 if (PyUnicode_READY(to))
1269 return -1;
1270
1271 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1272 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1273 PyErr_Format(PyExc_SystemError,
1274 "Cannot write %zi characters at %zi "
1275 "in a string of %zi characters",
1276 how_many, to_start, PyUnicode_GET_LENGTH(to));
1277 return -1;
1278 }
1279
1280 if (how_many == 0)
1281 return 0;
1282
Victor Stinner488fa492011-12-12 00:01:39 +01001283 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001284 return -1;
1285
1286 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1287 if (err) {
1288 PyErr_Format(PyExc_SystemError,
1289 "Cannot copy %s characters "
1290 "into a string of %s characters",
1291 unicode_kind_name(from),
1292 unicode_kind_name(to));
1293 return -1;
1294 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001295 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296}
1297
Victor Stinner17222162011-09-28 22:15:37 +02001298/* Find the maximum code point and count the number of surrogate pairs so a
1299 correct string length can be computed before converting a string to UCS4.
1300 This function counts single surrogates as a character and not as a pair.
1301
1302 Return 0 on success, or -1 on error. */
1303static int
1304find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1305 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306{
1307 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001308 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001309
Victor Stinnerc53be962011-10-02 21:33:54 +02001310 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 *num_surrogates = 0;
1312 *maxchar = 0;
1313
1314 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001316 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1317 && (iter+1) < end
1318 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001320 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322 iter += 2;
1323 }
1324 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001326 {
1327 ch = *iter;
1328 iter++;
1329 }
1330 if (ch > *maxchar) {
1331 *maxchar = ch;
1332 if (*maxchar > MAX_UNICODE) {
1333 PyErr_Format(PyExc_ValueError,
1334 "character U+%x is not in range [U+0000; U+10ffff]",
1335 ch);
1336 return -1;
1337 }
1338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001339 }
1340 return 0;
1341}
1342
1343#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001344static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345#endif
1346
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001347int
1348_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001349{
1350 wchar_t *end;
1351 Py_UCS4 maxchar = 0;
1352 Py_ssize_t num_surrogates;
1353#if SIZEOF_WCHAR_T == 2
1354 Py_ssize_t length_wo_surrogates;
1355#endif
1356
Georg Brandl7597add2011-10-05 16:36:47 +02001357 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001358 strings were created using _PyObject_New() and where no canonical
1359 representation (the str field) has been set yet aka strings
1360 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001361 assert(_PyUnicode_CHECK(unicode));
1362 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001364 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001365 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001366 /* Actually, it should neither be interned nor be anything else: */
1367 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368
1369#ifdef Py_DEBUG
1370 ++unicode_ready_calls;
1371#endif
1372
1373 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001374 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001375 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377
1378 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001379 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1380 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 PyErr_NoMemory();
1382 return -1;
1383 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001384 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 _PyUnicode_WSTR(unicode), end,
1386 PyUnicode_1BYTE_DATA(unicode));
1387 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1388 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1389 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1390 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001391 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001392 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001393 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 }
1395 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001396 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001397 _PyUnicode_UTF8(unicode) = NULL;
1398 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 }
1400 PyObject_FREE(_PyUnicode_WSTR(unicode));
1401 _PyUnicode_WSTR(unicode) = NULL;
1402 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1403 }
1404 /* In this case we might have to convert down from 4-byte native
1405 wchar_t to 2-byte unicode. */
1406 else if (maxchar < 65536) {
1407 assert(num_surrogates == 0 &&
1408 "FindMaxCharAndNumSurrogatePairs() messed up");
1409
Victor Stinner506f5922011-09-28 22:34:18 +02001410#if SIZEOF_WCHAR_T == 2
1411 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001412 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001413 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1414 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1415 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001416 _PyUnicode_UTF8(unicode) = NULL;
1417 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001418#else
1419 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001420 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001421 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001422 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001423 PyErr_NoMemory();
1424 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 }
Victor Stinner506f5922011-09-28 22:34:18 +02001426 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1427 _PyUnicode_WSTR(unicode), end,
1428 PyUnicode_2BYTE_DATA(unicode));
1429 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1430 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1431 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001432 _PyUnicode_UTF8(unicode) = NULL;
1433 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001434 PyObject_FREE(_PyUnicode_WSTR(unicode));
1435 _PyUnicode_WSTR(unicode) = NULL;
1436 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1437#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
1439 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1440 else {
1441#if SIZEOF_WCHAR_T == 2
1442 /* in case the native representation is 2-bytes, we need to allocate a
1443 new normalized 4-byte version. */
1444 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001445 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1446 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 PyErr_NoMemory();
1448 return -1;
1449 }
1450 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1451 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001452 _PyUnicode_UTF8(unicode) = NULL;
1453 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001454 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1455 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001456 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 PyObject_FREE(_PyUnicode_WSTR(unicode));
1458 _PyUnicode_WSTR(unicode) = NULL;
1459 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1460#else
1461 assert(num_surrogates == 0);
1462
Victor Stinnerc3c74152011-10-02 20:39:55 +02001463 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001465 _PyUnicode_UTF8(unicode) = NULL;
1466 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1468#endif
1469 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1470 }
1471 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001472 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 return 0;
1474}
1475
Alexander Belopolsky40018472011-02-26 01:02:56 +00001476static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001477unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001478{
Walter Dörwald16807132007-05-25 13:52:07 +00001479 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001480 case SSTATE_NOT_INTERNED:
1481 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001482
Benjamin Peterson29060642009-01-31 22:14:21 +00001483 case SSTATE_INTERNED_MORTAL:
1484 /* revive dead object temporarily for DelItem */
1485 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001486 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001487 Py_FatalError(
1488 "deletion of interned string failed");
1489 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001490
Benjamin Peterson29060642009-01-31 22:14:21 +00001491 case SSTATE_INTERNED_IMMORTAL:
1492 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001493
Benjamin Peterson29060642009-01-31 22:14:21 +00001494 default:
1495 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001496 }
1497
Victor Stinner03490912011-10-03 23:45:12 +02001498 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001500 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001501 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001502 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1503 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001505 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506}
1507
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001508#ifdef Py_DEBUG
1509static int
1510unicode_is_singleton(PyObject *unicode)
1511{
1512 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1513 if (unicode == unicode_empty)
1514 return 1;
1515 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1516 {
1517 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1518 if (ch < 256 && unicode_latin1[ch] == unicode)
1519 return 1;
1520 }
1521 return 0;
1522}
1523#endif
1524
Alexander Belopolsky40018472011-02-26 01:02:56 +00001525static int
Victor Stinner488fa492011-12-12 00:01:39 +01001526unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001527{
Victor Stinner488fa492011-12-12 00:01:39 +01001528 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001529 if (Py_REFCNT(unicode) != 1)
1530 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001531 if (_PyUnicode_HASH(unicode) != -1)
1532 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001533 if (PyUnicode_CHECK_INTERNED(unicode))
1534 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001535 if (!PyUnicode_CheckExact(unicode))
1536 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001537#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001538 /* singleton refcount is greater than 1 */
1539 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001540#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001541 return 1;
1542}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001543
Victor Stinnerfe226c02011-10-03 03:52:20 +02001544static int
1545unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1546{
1547 PyObject *unicode;
1548 Py_ssize_t old_length;
1549
1550 assert(p_unicode != NULL);
1551 unicode = *p_unicode;
1552
1553 assert(unicode != NULL);
1554 assert(PyUnicode_Check(unicode));
1555 assert(0 <= length);
1556
Victor Stinner910337b2011-10-03 03:20:16 +02001557 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001558 old_length = PyUnicode_WSTR_LENGTH(unicode);
1559 else
1560 old_length = PyUnicode_GET_LENGTH(unicode);
1561 if (old_length == length)
1562 return 0;
1563
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001564 if (length == 0) {
1565 Py_DECREF(*p_unicode);
1566 *p_unicode = unicode_empty;
1567 Py_INCREF(*p_unicode);
1568 return 0;
1569 }
1570
Victor Stinner488fa492011-12-12 00:01:39 +01001571 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001572 PyObject *copy = resize_copy(unicode, length);
1573 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001575 Py_DECREF(*p_unicode);
1576 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001577 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001578 }
1579
Victor Stinnerfe226c02011-10-03 03:52:20 +02001580 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001581 PyObject *new_unicode = resize_compact(unicode, length);
1582 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001583 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001584 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001585 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001586 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001587 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001588 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001589}
1590
Alexander Belopolsky40018472011-02-26 01:02:56 +00001591int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001593{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001594 PyObject *unicode;
1595 if (p_unicode == NULL) {
1596 PyErr_BadInternalCall();
1597 return -1;
1598 }
1599 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001600 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001601 {
1602 PyErr_BadInternalCall();
1603 return -1;
1604 }
1605 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001606}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001607
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001608static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001609unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001610{
1611 PyObject *result;
1612 assert(PyUnicode_IS_READY(*p_unicode));
1613 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1614 return 0;
1615 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1616 maxchar);
1617 if (result == NULL)
1618 return -1;
1619 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1620 PyUnicode_GET_LENGTH(*p_unicode));
1621 Py_DECREF(*p_unicode);
1622 *p_unicode = result;
1623 return 0;
1624}
1625
1626static int
1627unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1628 Py_UCS4 ch)
1629{
1630 if (unicode_widen(p_unicode, ch) < 0)
1631 return -1;
1632 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1633 PyUnicode_DATA(*p_unicode),
1634 (*pos)++, ch);
1635 return 0;
1636}
1637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638static PyObject*
1639get_latin1_char(unsigned char ch)
1640{
Victor Stinnera464fc12011-10-02 20:39:30 +02001641 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001643 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 if (!unicode)
1645 return NULL;
1646 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001647 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648 unicode_latin1[ch] = unicode;
1649 }
1650 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001651 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652}
1653
Alexander Belopolsky40018472011-02-26 01:02:56 +00001654PyObject *
1655PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001656{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001657 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 Py_UCS4 maxchar = 0;
1659 Py_ssize_t num_surrogates;
1660
1661 if (u == NULL)
1662 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001663
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001664 /* If the Unicode data is known at construction time, we can apply
1665 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 /* Optimization for empty strings */
1668 if (size == 0 && unicode_empty != NULL) {
1669 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001670 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001671 }
Tim Petersced69f82003-09-16 20:30:58 +00001672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 /* Single character Unicode objects in the Latin-1 range are
1674 shared when using this constructor */
1675 if (size == 1 && *u < 256)
1676 return get_latin1_char((unsigned char)*u);
1677
1678 /* If not empty and not single character, copy the Unicode data
1679 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001680 if (find_maxchar_surrogates(u, u + size,
1681 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682 return NULL;
1683
Victor Stinner8faf8212011-12-08 22:14:11 +01001684 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685 if (!unicode)
1686 return NULL;
1687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001688 switch (PyUnicode_KIND(unicode)) {
1689 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001690 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1692 break;
1693 case PyUnicode_2BYTE_KIND:
1694#if Py_UNICODE_SIZE == 2
1695 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1696#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001697 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1699#endif
1700 break;
1701 case PyUnicode_4BYTE_KIND:
1702#if SIZEOF_WCHAR_T == 2
1703 /* This is the only case which has to process surrogates, thus
1704 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001705 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706#else
1707 assert(num_surrogates == 0);
1708 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1709#endif
1710 break;
1711 default:
1712 assert(0 && "Impossible state");
1713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001715 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716}
1717
Alexander Belopolsky40018472011-02-26 01:02:56 +00001718PyObject *
1719PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001720{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001721 if (size < 0) {
1722 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001723 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001724 return NULL;
1725 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001726 if (u != NULL)
1727 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1728 else
1729 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001730}
1731
Alexander Belopolsky40018472011-02-26 01:02:56 +00001732PyObject *
1733PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001734{
1735 size_t size = strlen(u);
1736 if (size > PY_SSIZE_T_MAX) {
1737 PyErr_SetString(PyExc_OverflowError, "input too long");
1738 return NULL;
1739 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001740 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001741}
1742
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001743PyObject *
1744_PyUnicode_FromId(_Py_Identifier *id)
1745{
1746 if (!id->object) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001747 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1748 strlen(id->string),
1749 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001750 if (!id->object)
1751 return NULL;
1752 PyUnicode_InternInPlace(&id->object);
1753 assert(!id->next);
1754 id->next = static_strings;
1755 static_strings = id;
1756 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001757 return id->object;
1758}
1759
1760void
1761_PyUnicode_ClearStaticStrings()
1762{
1763 _Py_Identifier *i;
1764 for (i = static_strings; i; i = i->next) {
1765 Py_DECREF(i->object);
1766 i->object = NULL;
1767 i->next = NULL;
1768 }
1769}
1770
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001771/* Internal function, don't check maximum character */
1772
Victor Stinnere57b1c02011-09-28 22:20:48 +02001773static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001774unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001775{
Victor Stinner785938e2011-12-11 20:09:03 +01001776 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001777 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001778#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001779 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001780#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001781 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001782 }
Victor Stinner785938e2011-12-11 20:09:03 +01001783 unicode = PyUnicode_New(size, 127);
1784 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001785 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001786 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1787 assert(_PyUnicode_CheckConsistency(unicode, 1));
1788 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001789}
1790
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001791static Py_UCS4
1792kind_maxchar_limit(unsigned int kind)
1793{
1794 switch(kind) {
1795 case PyUnicode_1BYTE_KIND:
1796 return 0x80;
1797 case PyUnicode_2BYTE_KIND:
1798 return 0x100;
1799 case PyUnicode_4BYTE_KIND:
1800 return 0x10000;
1801 default:
1802 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001803 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001804 }
1805}
1806
Victor Stinner702c7342011-10-05 13:50:52 +02001807static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001808_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001809{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001811 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001812
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001813 if (size == 0) {
1814 Py_INCREF(unicode_empty);
1815 return unicode_empty;
1816 }
1817 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001818 if (size == 1)
1819 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001820
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001821 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001822 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 if (!res)
1824 return NULL;
1825 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001826 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001828}
1829
Victor Stinnere57b1c02011-09-28 22:20:48 +02001830static PyObject*
1831_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832{
1833 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001834 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001835
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001836 if (size == 0) {
1837 Py_INCREF(unicode_empty);
1838 return unicode_empty;
1839 }
1840 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001841 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001842 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001843
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001844 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001845 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846 if (!res)
1847 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001848 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001849 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001850 else {
1851 _PyUnicode_CONVERT_BYTES(
1852 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1853 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001854 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 return res;
1856}
1857
Victor Stinnere57b1c02011-09-28 22:20:48 +02001858static PyObject*
1859_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001860{
1861 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001862 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001863
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001864 if (size == 0) {
1865 Py_INCREF(unicode_empty);
1866 return unicode_empty;
1867 }
1868 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001869 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001870 return get_latin1_char((unsigned char)u[0]);
1871
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001872 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001873 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 if (!res)
1875 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001876 if (max_char < 256)
1877 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1878 PyUnicode_1BYTE_DATA(res));
1879 else if (max_char < 0x10000)
1880 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1881 PyUnicode_2BYTE_DATA(res));
1882 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001884 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 return res;
1886}
1887
1888PyObject*
1889PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1890{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001891 if (size < 0) {
1892 PyErr_SetString(PyExc_ValueError, "size must be positive");
1893 return NULL;
1894 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001895 switch(kind) {
1896 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001897 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001899 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001900 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001901 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001902 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001903 PyErr_SetString(PyExc_SystemError, "invalid kind");
1904 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001906}
1907
Victor Stinner25a4b292011-10-06 12:31:55 +02001908/* Ensure that a string uses the most efficient storage, if it is not the
1909 case: create a new string with of the right kind. Write NULL into *p_unicode
1910 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001911static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001912unicode_adjust_maxchar(PyObject **p_unicode)
1913{
1914 PyObject *unicode, *copy;
1915 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001916 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001917 unsigned int kind;
1918
1919 assert(p_unicode != NULL);
1920 unicode = *p_unicode;
1921 assert(PyUnicode_IS_READY(unicode));
1922 if (PyUnicode_IS_ASCII(unicode))
1923 return;
1924
1925 len = PyUnicode_GET_LENGTH(unicode);
1926 kind = PyUnicode_KIND(unicode);
1927 if (kind == PyUnicode_1BYTE_KIND) {
1928 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001929 max_char = ucs1lib_find_max_char(u, u + len);
1930 if (max_char >= 128)
1931 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001932 }
1933 else if (kind == PyUnicode_2BYTE_KIND) {
1934 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001935 max_char = ucs2lib_find_max_char(u, u + len);
1936 if (max_char >= 256)
1937 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001938 }
1939 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001940 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001941 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001942 max_char = ucs4lib_find_max_char(u, u + len);
1943 if (max_char >= 0x10000)
1944 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001945 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001946 copy = PyUnicode_New(len, max_char);
1947 copy_characters(copy, 0, unicode, 0, len);
1948 Py_DECREF(unicode);
1949 *p_unicode = copy;
1950}
1951
Victor Stinner034f6cf2011-09-30 02:26:44 +02001952PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01001953_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02001954{
Victor Stinner87af4f22011-11-21 23:03:47 +01001955 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001956 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001957
Victor Stinner034f6cf2011-09-30 02:26:44 +02001958 if (!PyUnicode_Check(unicode)) {
1959 PyErr_BadInternalCall();
1960 return NULL;
1961 }
1962 if (PyUnicode_READY(unicode))
1963 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001964
Victor Stinner87af4f22011-11-21 23:03:47 +01001965 length = PyUnicode_GET_LENGTH(unicode);
1966 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001967 if (!copy)
1968 return NULL;
1969 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1970
Victor Stinner87af4f22011-11-21 23:03:47 +01001971 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1972 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001973 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001974 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001975}
1976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977
Victor Stinnerbc603d12011-10-02 01:00:40 +02001978/* Widen Unicode objects to larger buffers. Don't write terminating null
1979 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001980
1981void*
1982_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1983{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001984 Py_ssize_t len;
1985 void *result;
1986 unsigned int skind;
1987
1988 if (PyUnicode_READY(s))
1989 return NULL;
1990
1991 len = PyUnicode_GET_LENGTH(s);
1992 skind = PyUnicode_KIND(s);
1993 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001994 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995 return NULL;
1996 }
1997 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001998 case PyUnicode_2BYTE_KIND:
1999 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2000 if (!result)
2001 return PyErr_NoMemory();
2002 assert(skind == PyUnicode_1BYTE_KIND);
2003 _PyUnicode_CONVERT_BYTES(
2004 Py_UCS1, Py_UCS2,
2005 PyUnicode_1BYTE_DATA(s),
2006 PyUnicode_1BYTE_DATA(s) + len,
2007 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002009 case PyUnicode_4BYTE_KIND:
2010 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2011 if (!result)
2012 return PyErr_NoMemory();
2013 if (skind == PyUnicode_2BYTE_KIND) {
2014 _PyUnicode_CONVERT_BYTES(
2015 Py_UCS2, Py_UCS4,
2016 PyUnicode_2BYTE_DATA(s),
2017 PyUnicode_2BYTE_DATA(s) + len,
2018 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002020 else {
2021 assert(skind == PyUnicode_1BYTE_KIND);
2022 _PyUnicode_CONVERT_BYTES(
2023 Py_UCS1, Py_UCS4,
2024 PyUnicode_1BYTE_DATA(s),
2025 PyUnicode_1BYTE_DATA(s) + len,
2026 result);
2027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002029 default:
2030 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002031 }
Victor Stinner01698042011-10-04 00:04:26 +02002032 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 return NULL;
2034}
2035
2036static Py_UCS4*
2037as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2038 int copy_null)
2039{
2040 int kind;
2041 void *data;
2042 Py_ssize_t len, targetlen;
2043 if (PyUnicode_READY(string) == -1)
2044 return NULL;
2045 kind = PyUnicode_KIND(string);
2046 data = PyUnicode_DATA(string);
2047 len = PyUnicode_GET_LENGTH(string);
2048 targetlen = len;
2049 if (copy_null)
2050 targetlen++;
2051 if (!target) {
2052 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2053 PyErr_NoMemory();
2054 return NULL;
2055 }
2056 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2057 if (!target) {
2058 PyErr_NoMemory();
2059 return NULL;
2060 }
2061 }
2062 else {
2063 if (targetsize < targetlen) {
2064 PyErr_Format(PyExc_SystemError,
2065 "string is longer than the buffer");
2066 if (copy_null && 0 < targetsize)
2067 target[0] = 0;
2068 return NULL;
2069 }
2070 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002071 if (kind == PyUnicode_1BYTE_KIND) {
2072 Py_UCS1 *start = (Py_UCS1 *) data;
2073 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002075 else if (kind == PyUnicode_2BYTE_KIND) {
2076 Py_UCS2 *start = (Py_UCS2 *) data;
2077 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2078 }
2079 else {
2080 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002083 if (copy_null)
2084 target[len] = 0;
2085 return target;
2086}
2087
2088Py_UCS4*
2089PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2090 int copy_null)
2091{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002092 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002093 PyErr_BadInternalCall();
2094 return NULL;
2095 }
2096 return as_ucs4(string, target, targetsize, copy_null);
2097}
2098
2099Py_UCS4*
2100PyUnicode_AsUCS4Copy(PyObject *string)
2101{
2102 return as_ucs4(string, NULL, 0, 1);
2103}
2104
2105#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002106
Alexander Belopolsky40018472011-02-26 01:02:56 +00002107PyObject *
2108PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002110 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002111 if (size == 0) {
2112 Py_INCREF(unicode_empty);
2113 return unicode_empty;
2114 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002115 PyErr_BadInternalCall();
2116 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002117 }
2118
Martin v. Löwis790465f2008-04-05 20:41:37 +00002119 if (size == -1) {
2120 size = wcslen(w);
2121 }
2122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002123 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124}
2125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002126#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002127
Walter Dörwald346737f2007-05-31 10:44:43 +00002128static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002129makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2130 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002131{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002132 *fmt++ = '%';
2133 if (width) {
2134 if (zeropad)
2135 *fmt++ = '0';
2136 fmt += sprintf(fmt, "%d", width);
2137 }
2138 if (precision)
2139 fmt += sprintf(fmt, ".%d", precision);
2140 if (longflag)
2141 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002142 else if (longlongflag) {
2143 /* longlongflag should only ever be nonzero on machines with
2144 HAVE_LONG_LONG defined */
2145#ifdef HAVE_LONG_LONG
2146 char *f = PY_FORMAT_LONG_LONG;
2147 while (*f)
2148 *fmt++ = *f++;
2149#else
2150 /* we shouldn't ever get here */
2151 assert(0);
2152 *fmt++ = 'l';
2153#endif
2154 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002155 else if (size_tflag) {
2156 char *f = PY_FORMAT_SIZE_T;
2157 while (*f)
2158 *fmt++ = *f++;
2159 }
2160 *fmt++ = c;
2161 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002162}
2163
Victor Stinner96865452011-03-01 23:44:09 +00002164/* helper for PyUnicode_FromFormatV() */
2165
2166static const char*
2167parse_format_flags(const char *f,
2168 int *p_width, int *p_precision,
2169 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2170{
2171 int width, precision, longflag, longlongflag, size_tflag;
2172
2173 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2174 f++;
2175 width = 0;
2176 while (Py_ISDIGIT((unsigned)*f))
2177 width = (width*10) + *f++ - '0';
2178 precision = 0;
2179 if (*f == '.') {
2180 f++;
2181 while (Py_ISDIGIT((unsigned)*f))
2182 precision = (precision*10) + *f++ - '0';
2183 if (*f == '%') {
2184 /* "%.3%s" => f points to "3" */
2185 f--;
2186 }
2187 }
2188 if (*f == '\0') {
2189 /* bogus format "%.1" => go backward, f points to "1" */
2190 f--;
2191 }
2192 if (p_width != NULL)
2193 *p_width = width;
2194 if (p_precision != NULL)
2195 *p_precision = precision;
2196
2197 /* Handle %ld, %lu, %lld and %llu. */
2198 longflag = 0;
2199 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002200 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002201
2202 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002203 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002204 longflag = 1;
2205 ++f;
2206 }
2207#ifdef HAVE_LONG_LONG
2208 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002209 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002210 longlongflag = 1;
2211 f += 2;
2212 }
2213#endif
2214 }
2215 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002216 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002217 size_tflag = 1;
2218 ++f;
2219 }
2220 if (p_longflag != NULL)
2221 *p_longflag = longflag;
2222 if (p_longlongflag != NULL)
2223 *p_longlongflag = longlongflag;
2224 if (p_size_tflag != NULL)
2225 *p_size_tflag = size_tflag;
2226 return f;
2227}
2228
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002229/* maximum number of characters required for output of %ld. 21 characters
2230 allows for 64-bit integers (in decimal) and an optional sign. */
2231#define MAX_LONG_CHARS 21
2232/* maximum number of characters required for output of %lld.
2233 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2234 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2235#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2236
Walter Dörwaldd2034312007-05-18 16:29:38 +00002237PyObject *
2238PyUnicode_FromFormatV(const char *format, va_list vargs)
2239{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002240 va_list count;
2241 Py_ssize_t callcount = 0;
2242 PyObject **callresults = NULL;
2243 PyObject **callresult = NULL;
2244 Py_ssize_t n = 0;
2245 int width = 0;
2246 int precision = 0;
2247 int zeropad;
2248 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002249 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002250 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002251 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2253 Py_UCS4 argmaxchar;
2254 Py_ssize_t numbersize = 0;
2255 char *numberresults = NULL;
2256 char *numberresult = NULL;
2257 Py_ssize_t i;
2258 int kind;
2259 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002260
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002261 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002262 /* step 1: count the number of %S/%R/%A/%s format specifications
2263 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2264 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002266 * also estimate a upper bound for all the number formats in the string,
2267 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002269 for (f = format; *f; f++) {
2270 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002271 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2273 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2274 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2275 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002278#ifdef HAVE_LONG_LONG
2279 if (longlongflag) {
2280 if (width < MAX_LONG_LONG_CHARS)
2281 width = MAX_LONG_LONG_CHARS;
2282 }
2283 else
2284#endif
2285 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2286 including sign. Decimal takes the most space. This
2287 isn't enough for octal. If a width is specified we
2288 need more (which we allocate later). */
2289 if (width < MAX_LONG_CHARS)
2290 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002291
2292 /* account for the size + '\0' to separate numbers
2293 inside of the numberresults buffer */
2294 numbersize += (width + 1);
2295 }
2296 }
2297 else if ((unsigned char)*f > 127) {
2298 PyErr_Format(PyExc_ValueError,
2299 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2300 "string, got a non-ASCII byte: 0x%02x",
2301 (unsigned char)*f);
2302 return NULL;
2303 }
2304 }
2305 /* step 2: allocate memory for the results of
2306 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2307 if (callcount) {
2308 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2309 if (!callresults) {
2310 PyErr_NoMemory();
2311 return NULL;
2312 }
2313 callresult = callresults;
2314 }
2315 /* step 2.5: allocate memory for the results of formating numbers */
2316 if (numbersize) {
2317 numberresults = PyObject_Malloc(numbersize);
2318 if (!numberresults) {
2319 PyErr_NoMemory();
2320 goto fail;
2321 }
2322 numberresult = numberresults;
2323 }
2324
2325 /* step 3: format numbers and figure out how large a buffer we need */
2326 for (f = format; *f; f++) {
2327 if (*f == '%') {
2328 const char* p;
2329 int longflag;
2330 int longlongflag;
2331 int size_tflag;
2332 int numprinted;
2333
2334 p = f;
2335 zeropad = (f[1] == '0');
2336 f = parse_format_flags(f, &width, &precision,
2337 &longflag, &longlongflag, &size_tflag);
2338 switch (*f) {
2339 case 'c':
2340 {
2341 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002342 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002343 n++;
2344 break;
2345 }
2346 case '%':
2347 n++;
2348 break;
2349 case 'i':
2350 case 'd':
2351 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2352 width, precision, *f);
2353 if (longflag)
2354 numprinted = sprintf(numberresult, fmt,
2355 va_arg(count, long));
2356#ifdef HAVE_LONG_LONG
2357 else if (longlongflag)
2358 numprinted = sprintf(numberresult, fmt,
2359 va_arg(count, PY_LONG_LONG));
2360#endif
2361 else if (size_tflag)
2362 numprinted = sprintf(numberresult, fmt,
2363 va_arg(count, Py_ssize_t));
2364 else
2365 numprinted = sprintf(numberresult, fmt,
2366 va_arg(count, int));
2367 n += numprinted;
2368 /* advance by +1 to skip over the '\0' */
2369 numberresult += (numprinted + 1);
2370 assert(*(numberresult - 1) == '\0');
2371 assert(*(numberresult - 2) != '\0');
2372 assert(numprinted >= 0);
2373 assert(numberresult <= numberresults + numbersize);
2374 break;
2375 case 'u':
2376 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2377 width, precision, 'u');
2378 if (longflag)
2379 numprinted = sprintf(numberresult, fmt,
2380 va_arg(count, unsigned long));
2381#ifdef HAVE_LONG_LONG
2382 else if (longlongflag)
2383 numprinted = sprintf(numberresult, fmt,
2384 va_arg(count, unsigned PY_LONG_LONG));
2385#endif
2386 else if (size_tflag)
2387 numprinted = sprintf(numberresult, fmt,
2388 va_arg(count, size_t));
2389 else
2390 numprinted = sprintf(numberresult, fmt,
2391 va_arg(count, unsigned int));
2392 n += numprinted;
2393 numberresult += (numprinted + 1);
2394 assert(*(numberresult - 1) == '\0');
2395 assert(*(numberresult - 2) != '\0');
2396 assert(numprinted >= 0);
2397 assert(numberresult <= numberresults + numbersize);
2398 break;
2399 case 'x':
2400 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2401 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2402 n += numprinted;
2403 numberresult += (numprinted + 1);
2404 assert(*(numberresult - 1) == '\0');
2405 assert(*(numberresult - 2) != '\0');
2406 assert(numprinted >= 0);
2407 assert(numberresult <= numberresults + numbersize);
2408 break;
2409 case 'p':
2410 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2411 /* %p is ill-defined: ensure leading 0x. */
2412 if (numberresult[1] == 'X')
2413 numberresult[1] = 'x';
2414 else if (numberresult[1] != 'x') {
2415 memmove(numberresult + 2, numberresult,
2416 strlen(numberresult) + 1);
2417 numberresult[0] = '0';
2418 numberresult[1] = 'x';
2419 numprinted += 2;
2420 }
2421 n += numprinted;
2422 numberresult += (numprinted + 1);
2423 assert(*(numberresult - 1) == '\0');
2424 assert(*(numberresult - 2) != '\0');
2425 assert(numprinted >= 0);
2426 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002427 break;
2428 case 's':
2429 {
2430 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002431 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002432 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002433 if (!str)
2434 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 /* since PyUnicode_DecodeUTF8 returns already flexible
2436 unicode objects, there is no need to call ready on them */
2437 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002438 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002440 /* Remember the str and switch to the next slot */
2441 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002442 break;
2443 }
2444 case 'U':
2445 {
2446 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002447 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 if (PyUnicode_READY(obj) == -1)
2449 goto fail;
2450 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002451 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002452 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002453 break;
2454 }
2455 case 'V':
2456 {
2457 PyObject *obj = va_arg(count, PyObject *);
2458 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002459 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002460 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002461 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002462 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002463 if (PyUnicode_READY(obj) == -1)
2464 goto fail;
2465 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002466 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002467 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002468 *callresult++ = NULL;
2469 }
2470 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002471 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002472 if (!str_obj)
2473 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002474 if (PyUnicode_READY(str_obj)) {
2475 Py_DECREF(str_obj);
2476 goto fail;
2477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002478 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002479 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002481 *callresult++ = str_obj;
2482 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002483 break;
2484 }
2485 case 'S':
2486 {
2487 PyObject *obj = va_arg(count, PyObject *);
2488 PyObject *str;
2489 assert(obj);
2490 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002491 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002492 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002493 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002494 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002496 /* Remember the str and switch to the next slot */
2497 *callresult++ = str;
2498 break;
2499 }
2500 case 'R':
2501 {
2502 PyObject *obj = va_arg(count, PyObject *);
2503 PyObject *repr;
2504 assert(obj);
2505 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002506 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002507 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002508 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002509 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002510 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002511 /* Remember the repr and switch to the next slot */
2512 *callresult++ = repr;
2513 break;
2514 }
2515 case 'A':
2516 {
2517 PyObject *obj = va_arg(count, PyObject *);
2518 PyObject *ascii;
2519 assert(obj);
2520 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002521 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002522 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002523 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002524 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002525 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002526 /* Remember the repr and switch to the next slot */
2527 *callresult++ = ascii;
2528 break;
2529 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002530 default:
2531 /* if we stumble upon an unknown
2532 formatting code, copy the rest of
2533 the format string to the output
2534 string. (we cannot just skip the
2535 code, since there's no way to know
2536 what's in the argument list) */
2537 n += strlen(p);
2538 goto expand;
2539 }
2540 } else
2541 n++;
2542 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002543 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002544 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002545 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002546 we don't have to resize the string.
2547 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002548 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002549 if (!string)
2550 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002551 kind = PyUnicode_KIND(string);
2552 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002553 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002554 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002557 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002558 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002559
2560 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002561 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2562 /* checking for == because the last argument could be a empty
2563 string, which causes i to point to end, the assert at the end of
2564 the loop */
2565 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002566
Benjamin Peterson14339b62009-01-31 16:36:08 +00002567 switch (*f) {
2568 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002569 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002570 const int ordinal = va_arg(vargs, int);
2571 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002572 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002573 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002574 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002575 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002576 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002577 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002578 case 'p':
2579 /* unused, since we already have the result */
2580 if (*f == 'p')
2581 (void) va_arg(vargs, void *);
2582 else
2583 (void) va_arg(vargs, int);
2584 /* extract the result from numberresults and append. */
2585 for (; *numberresult; ++i, ++numberresult)
2586 PyUnicode_WRITE(kind, data, i, *numberresult);
2587 /* skip over the separating '\0' */
2588 assert(*numberresult == '\0');
2589 numberresult++;
2590 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 break;
2592 case 's':
2593 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002594 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002595 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002596 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597 size = PyUnicode_GET_LENGTH(*callresult);
2598 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002599 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002600 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002601 /* We're done with the unicode()/repr() => forget it */
2602 Py_DECREF(*callresult);
2603 /* switch to next unicode()/repr() result */
2604 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002605 break;
2606 }
2607 case 'U':
2608 {
2609 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002610 Py_ssize_t size;
2611 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2612 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002613 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002614 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002615 break;
2616 }
2617 case 'V':
2618 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002620 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002621 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002622 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002623 size = PyUnicode_GET_LENGTH(obj);
2624 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002625 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002627 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002628 size = PyUnicode_GET_LENGTH(*callresult);
2629 assert(PyUnicode_KIND(*callresult) <=
2630 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002631 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002632 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002633 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002634 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002635 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002636 break;
2637 }
2638 case 'S':
2639 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002640 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002641 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002642 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002643 /* unused, since we already have the result */
2644 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002645 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002646 copy_characters(string, i, *callresult, 0, size);
2647 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 /* We're done with the unicode()/repr() => forget it */
2649 Py_DECREF(*callresult);
2650 /* switch to next unicode()/repr() result */
2651 ++callresult;
2652 break;
2653 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002654 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002655 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002656 break;
2657 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002658 for (; *p; ++p, ++i)
2659 PyUnicode_WRITE(kind, data, i, *p);
2660 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002661 goto end;
2662 }
Victor Stinner1205f272010-09-11 00:54:47 +00002663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002664 else {
2665 assert(i < PyUnicode_GET_LENGTH(string));
2666 PyUnicode_WRITE(kind, data, i++, *f);
2667 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002670
Benjamin Peterson29060642009-01-31 22:14:21 +00002671 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002672 if (callresults)
2673 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002674 if (numberresults)
2675 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002676 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002677 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002678 if (callresults) {
2679 PyObject **callresult2 = callresults;
2680 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002681 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002682 ++callresult2;
2683 }
2684 PyObject_Free(callresults);
2685 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002686 if (numberresults)
2687 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002688 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002689}
2690
Walter Dörwaldd2034312007-05-18 16:29:38 +00002691PyObject *
2692PyUnicode_FromFormat(const char *format, ...)
2693{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 PyObject* ret;
2695 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002696
2697#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002698 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002699#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002700 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002701#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002702 ret = PyUnicode_FromFormatV(format, vargs);
2703 va_end(vargs);
2704 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002705}
2706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002707#ifdef HAVE_WCHAR_H
2708
Victor Stinner5593d8a2010-10-02 11:11:27 +00002709/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2710 convert a Unicode object to a wide character string.
2711
Victor Stinnerd88d9832011-09-06 02:00:05 +02002712 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002713 character) required to convert the unicode object. Ignore size argument.
2714
Victor Stinnerd88d9832011-09-06 02:00:05 +02002715 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002716 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002717 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002718static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002719unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002720 wchar_t *w,
2721 Py_ssize_t size)
2722{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002723 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002724 const wchar_t *wstr;
2725
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002726 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002727 if (wstr == NULL)
2728 return -1;
2729
Victor Stinner5593d8a2010-10-02 11:11:27 +00002730 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002731 if (size > res)
2732 size = res + 1;
2733 else
2734 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002735 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002736 return res;
2737 }
2738 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002740}
2741
2742Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002743PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002744 wchar_t *w,
2745 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746{
2747 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002748 PyErr_BadInternalCall();
2749 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002751 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752}
2753
Victor Stinner137c34c2010-09-29 10:25:54 +00002754wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002755PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002756 Py_ssize_t *size)
2757{
2758 wchar_t* buffer;
2759 Py_ssize_t buflen;
2760
2761 if (unicode == NULL) {
2762 PyErr_BadInternalCall();
2763 return NULL;
2764 }
2765
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002766 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002767 if (buflen == -1)
2768 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002769 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002770 PyErr_NoMemory();
2771 return NULL;
2772 }
2773
Victor Stinner137c34c2010-09-29 10:25:54 +00002774 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2775 if (buffer == NULL) {
2776 PyErr_NoMemory();
2777 return NULL;
2778 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002779 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002780 if (buflen == -1)
2781 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002782 if (size != NULL)
2783 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002784 return buffer;
2785}
2786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002787#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788
Alexander Belopolsky40018472011-02-26 01:02:56 +00002789PyObject *
2790PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002791{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002792 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002793 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002794 PyErr_SetString(PyExc_ValueError,
2795 "chr() arg not in range(0x110000)");
2796 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002797 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002799 if (ordinal < 256)
2800 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002802 v = PyUnicode_New(1, ordinal);
2803 if (v == NULL)
2804 return NULL;
2805 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002806 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002807 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002808}
2809
Alexander Belopolsky40018472011-02-26 01:02:56 +00002810PyObject *
2811PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002813 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002814 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002815 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002816 if (PyUnicode_READY(obj))
2817 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002818 Py_INCREF(obj);
2819 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002820 }
2821 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002822 /* For a Unicode subtype that's not a Unicode object,
2823 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002824 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002825 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002826 PyErr_Format(PyExc_TypeError,
2827 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002828 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002829 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002830}
2831
Alexander Belopolsky40018472011-02-26 01:02:56 +00002832PyObject *
2833PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002834 const char *encoding,
2835 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002836{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002837 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002838 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002839
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002841 PyErr_BadInternalCall();
2842 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002844
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002845 /* Decoding bytes objects is the most common case and should be fast */
2846 if (PyBytes_Check(obj)) {
2847 if (PyBytes_GET_SIZE(obj) == 0) {
2848 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002849 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002850 }
2851 else {
2852 v = PyUnicode_Decode(
2853 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2854 encoding, errors);
2855 }
2856 return v;
2857 }
2858
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002859 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002860 PyErr_SetString(PyExc_TypeError,
2861 "decoding str is not supported");
2862 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002863 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002864
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002865 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2866 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2867 PyErr_Format(PyExc_TypeError,
2868 "coercing to str: need bytes, bytearray "
2869 "or buffer-like object, %.80s found",
2870 Py_TYPE(obj)->tp_name);
2871 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002872 }
Tim Petersced69f82003-09-16 20:30:58 +00002873
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002874 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002875 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002876 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 }
Tim Petersced69f82003-09-16 20:30:58 +00002878 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002879 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002880
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002881 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002882 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883}
2884
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   lower is the output buffer and lower_len its total size in bytes,
   including the room needed for the terminating NUL.

   Return 0 on error (the normalized name does not fit in lower_len-1
   characters), 1 on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* A zero-sized buffer cannot even hold the terminating NUL.  Bail out
       before forming &lower[lower_len - 1], which would point in front of
       the buffer. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* Honor lower_len on this path too instead of copying blindly. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    e = encoding;
    l = lower;
    /* l_end is the last slot of the buffer; it is reserved for the NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2921
Alexander Belopolsky40018472011-02-26 01:02:56 +00002922PyObject *
2923PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002924 Py_ssize_t size,
2925 const char *encoding,
2926 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002927{
2928 PyObject *buffer = NULL, *unicode;
2929 Py_buffer info;
2930 char lower[11]; /* Enough for any encoding shortcut */
2931
Fred Drakee4315f52000-05-09 19:53:39 +00002932 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002933 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002934 if ((strcmp(lower, "utf-8") == 0) ||
2935 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002936 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002937 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002938 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002939 (strcmp(lower, "iso-8859-1") == 0))
2940 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002941#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002942 else if (strcmp(lower, "mbcs") == 0)
2943 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002944#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002945 else if (strcmp(lower, "ascii") == 0)
2946 return PyUnicode_DecodeASCII(s, size, errors);
2947 else if (strcmp(lower, "utf-16") == 0)
2948 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2949 else if (strcmp(lower, "utf-32") == 0)
2950 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2951 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002952
2953 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002954 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002955 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002956 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002957 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958 if (buffer == NULL)
2959 goto onError;
2960 unicode = PyCodec_Decode(buffer, encoding, errors);
2961 if (unicode == NULL)
2962 goto onError;
2963 if (!PyUnicode_Check(unicode)) {
2964 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002965 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002966 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967 Py_DECREF(unicode);
2968 goto onError;
2969 }
2970 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002971 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002972
Benjamin Peterson29060642009-01-31 22:14:21 +00002973 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002974 Py_XDECREF(buffer);
2975 return NULL;
2976}
2977
Alexander Belopolsky40018472011-02-26 01:02:56 +00002978PyObject *
2979PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002980 const char *encoding,
2981 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002982{
2983 PyObject *v;
2984
2985 if (!PyUnicode_Check(unicode)) {
2986 PyErr_BadArgument();
2987 goto onError;
2988 }
2989
2990 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002991 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002992
2993 /* Decode via the codec registry */
2994 v = PyCodec_Decode(unicode, encoding, errors);
2995 if (v == NULL)
2996 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002997 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002998
Benjamin Peterson29060642009-01-31 22:14:21 +00002999 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003000 return NULL;
3001}
3002
Alexander Belopolsky40018472011-02-26 01:02:56 +00003003PyObject *
3004PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003005 const char *encoding,
3006 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003007{
3008 PyObject *v;
3009
3010 if (!PyUnicode_Check(unicode)) {
3011 PyErr_BadArgument();
3012 goto onError;
3013 }
3014
3015 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003016 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003017
3018 /* Decode via the codec registry */
3019 v = PyCodec_Decode(unicode, encoding, errors);
3020 if (v == NULL)
3021 goto onError;
3022 if (!PyUnicode_Check(v)) {
3023 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003024 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003025 Py_TYPE(v)->tp_name);
3026 Py_DECREF(v);
3027 goto onError;
3028 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003029 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003030
Benjamin Peterson29060642009-01-31 22:14:21 +00003031 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003032 return NULL;
3033}
3034
Alexander Belopolsky40018472011-02-26 01:02:56 +00003035PyObject *
3036PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003037 Py_ssize_t size,
3038 const char *encoding,
3039 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040{
3041 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003042
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043 unicode = PyUnicode_FromUnicode(s, size);
3044 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003045 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3047 Py_DECREF(unicode);
3048 return v;
3049}
3050
Alexander Belopolsky40018472011-02-26 01:02:56 +00003051PyObject *
3052PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003053 const char *encoding,
3054 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003055{
3056 PyObject *v;
3057
3058 if (!PyUnicode_Check(unicode)) {
3059 PyErr_BadArgument();
3060 goto onError;
3061 }
3062
3063 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003064 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003065
3066 /* Encode via the codec registry */
3067 v = PyCodec_Encode(unicode, encoding, errors);
3068 if (v == NULL)
3069 goto onError;
3070 return v;
3071
Benjamin Peterson29060642009-01-31 22:14:21 +00003072 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003073 return NULL;
3074}
3075
Victor Stinnerad158722010-10-27 00:25:46 +00003076PyObject *
3077PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003078{
Victor Stinner99b95382011-07-04 14:23:54 +02003079#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003080 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003081#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003082 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003083#else
Victor Stinner793b5312011-04-27 00:24:21 +02003084 PyInterpreterState *interp = PyThreadState_GET()->interp;
3085 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3086 cannot use it to encode and decode filenames before it is loaded. Load
3087 the Python codec requires to encode at least its own filename. Use the C
3088 version of the locale codec until the codec registry is initialized and
3089 the Python codec is loaded.
3090
3091 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3092 cannot only rely on it: check also interp->fscodec_initialized for
3093 subinterpreters. */
3094 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003095 return PyUnicode_AsEncodedString(unicode,
3096 Py_FileSystemDefaultEncoding,
3097 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003098 }
3099 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003100 /* locale encoding with surrogateescape */
3101 wchar_t *wchar;
3102 char *bytes;
3103 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003104 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003105
3106 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3107 if (wchar == NULL)
3108 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003109 bytes = _Py_wchar2char(wchar, &error_pos);
3110 if (bytes == NULL) {
3111 if (error_pos != (size_t)-1) {
3112 char *errmsg = strerror(errno);
3113 PyObject *exc = NULL;
3114 if (errmsg == NULL)
3115 errmsg = "Py_wchar2char() failed";
3116 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003117 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003118 error_pos, error_pos+1,
3119 errmsg);
3120 Py_XDECREF(exc);
3121 }
3122 else
3123 PyErr_NoMemory();
3124 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003125 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003126 }
3127 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003128
3129 bytes_obj = PyBytes_FromString(bytes);
3130 PyMem_Free(bytes);
3131 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003132 }
Victor Stinnerad158722010-10-27 00:25:46 +00003133#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003134}
3135
Alexander Belopolsky40018472011-02-26 01:02:56 +00003136PyObject *
3137PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003138 const char *encoding,
3139 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003140{
3141 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003142 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003143
Guido van Rossumd57fd912000-03-10 22:53:23 +00003144 if (!PyUnicode_Check(unicode)) {
3145 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003146 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147 }
Fred Drakee4315f52000-05-09 19:53:39 +00003148
Fred Drakee4315f52000-05-09 19:53:39 +00003149 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003150 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003151 if ((strcmp(lower, "utf-8") == 0) ||
3152 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003153 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003154 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003155 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003156 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003157 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003158 }
Victor Stinner37296e82010-06-10 13:36:23 +00003159 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003160 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003161 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003162 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003163#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003164 else if (strcmp(lower, "mbcs") == 0)
3165 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003166#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003167 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003168 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170
3171 /* Encode via the codec registry */
3172 v = PyCodec_Encode(unicode, encoding, errors);
3173 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003174 return NULL;
3175
3176 /* The normal path */
3177 if (PyBytes_Check(v))
3178 return v;
3179
3180 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003181 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003182 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003183 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003184
3185 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3186 "encoder %s returned bytearray instead of bytes",
3187 encoding);
3188 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003189 Py_DECREF(v);
3190 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003191 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003192
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003193 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3194 Py_DECREF(v);
3195 return b;
3196 }
3197
3198 PyErr_Format(PyExc_TypeError,
3199 "encoder did not return a bytes object (type=%.400s)",
3200 Py_TYPE(v)->tp_name);
3201 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003202 return NULL;
3203}
3204
Alexander Belopolsky40018472011-02-26 01:02:56 +00003205PyObject *
3206PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003207 const char *encoding,
3208 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003209{
3210 PyObject *v;
3211
3212 if (!PyUnicode_Check(unicode)) {
3213 PyErr_BadArgument();
3214 goto onError;
3215 }
3216
3217 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003218 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003219
3220 /* Encode via the codec registry */
3221 v = PyCodec_Encode(unicode, encoding, errors);
3222 if (v == NULL)
3223 goto onError;
3224 if (!PyUnicode_Check(v)) {
3225 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003226 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003227 Py_TYPE(v)->tp_name);
3228 Py_DECREF(v);
3229 goto onError;
3230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003232
Benjamin Peterson29060642009-01-31 22:14:21 +00003233 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 return NULL;
3235}
3236
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003237PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003238PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3239 int surrogateescape)
3240{
3241 wchar_t smallbuf[256];
3242 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3243 wchar_t *wstr;
3244 size_t wlen, wlen2;
3245 PyObject *unicode;
3246
3247 if (str[len] != '\0' || len != strlen(str)) {
3248 PyErr_SetString(PyExc_TypeError, "embedded null character");
3249 return NULL;
3250 }
3251
3252 if (surrogateescape)
3253 {
3254 wstr = _Py_char2wchar(str, &wlen);
3255 if (wstr == NULL) {
3256 if (wlen == (size_t)-1)
3257 PyErr_NoMemory();
3258 else
3259 PyErr_SetFromErrno(PyExc_OSError);
3260 return NULL;
3261 }
3262
3263 unicode = PyUnicode_FromWideChar(wstr, wlen);
3264 PyMem_Free(wstr);
3265 }
3266 else {
3267#ifndef HAVE_BROKEN_MBSTOWCS
3268 wlen = mbstowcs(NULL, str, 0);
3269#else
3270 wlen = len;
3271#endif
3272 if (wlen == (size_t)-1) {
3273 PyErr_SetFromErrno(PyExc_OSError);
3274 return NULL;
3275 }
3276 if (wlen+1 <= smallbuf_len) {
3277 wstr = smallbuf;
3278 }
3279 else {
3280 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3281 return PyErr_NoMemory();
3282
3283 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3284 if (!wstr)
3285 return PyErr_NoMemory();
3286 }
3287
3288 /* This shouldn't fail now */
3289 wlen2 = mbstowcs(wstr, str, wlen+1);
3290 if (wlen2 == (size_t)-1) {
3291 if (wstr != smallbuf)
3292 PyMem_Free(wstr);
3293 PyErr_SetFromErrno(PyExc_OSError);
3294 return NULL;
3295 }
3296#ifdef HAVE_BROKEN_MBSTOWCS
3297 assert(wlen2 == wlen);
3298#endif
3299 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3300 if (wstr != smallbuf)
3301 PyMem_Free(wstr);
3302 }
3303 return unicode;
3304}
3305
3306PyObject*
3307PyUnicode_DecodeLocale(const char *str, int surrogateescape)
3308{
3309 Py_ssize_t size = (Py_ssize_t)strlen(str);
3310 return PyUnicode_DecodeLocaleAndSize(str, size, surrogateescape);
3311}
3312
3313
3314PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003315PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003316 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003317 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3318}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003319
Christian Heimes5894ba72007-11-04 11:43:14 +00003320PyObject*
3321PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3322{
Victor Stinner99b95382011-07-04 14:23:54 +02003323#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003324 return PyUnicode_DecodeMBCS(s, size, NULL);
3325#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003326 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003327#else
Victor Stinner793b5312011-04-27 00:24:21 +02003328 PyInterpreterState *interp = PyThreadState_GET()->interp;
3329 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3330 cannot use it to encode and decode filenames before it is loaded. Load
3331 the Python codec requires to encode at least its own filename. Use the C
3332 version of the locale codec until the codec registry is initialized and
3333 the Python codec is loaded.
3334
3335 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3336 cannot only rely on it: check also interp->fscodec_initialized for
3337 subinterpreters. */
3338 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003339 return PyUnicode_Decode(s, size,
3340 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003341 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003342 }
3343 else {
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003344 return PyUnicode_DecodeLocaleAndSize(s, size, 1);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003345 }
Victor Stinnerad158722010-10-27 00:25:46 +00003346#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003347}
3348
Martin v. Löwis011e8422009-05-05 04:43:17 +00003349
3350int
3351PyUnicode_FSConverter(PyObject* arg, void* addr)
3352{
3353 PyObject *output = NULL;
3354 Py_ssize_t size;
3355 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003356 if (arg == NULL) {
3357 Py_DECREF(*(PyObject**)addr);
3358 return 1;
3359 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003360 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003361 output = arg;
3362 Py_INCREF(output);
3363 }
3364 else {
3365 arg = PyUnicode_FromObject(arg);
3366 if (!arg)
3367 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003368 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003369 Py_DECREF(arg);
3370 if (!output)
3371 return 0;
3372 if (!PyBytes_Check(output)) {
3373 Py_DECREF(output);
3374 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3375 return 0;
3376 }
3377 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003378 size = PyBytes_GET_SIZE(output);
3379 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003380 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003381 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003382 Py_DECREF(output);
3383 return 0;
3384 }
3385 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003386 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003387}
3388
3389
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003390int
3391PyUnicode_FSDecoder(PyObject* arg, void* addr)
3392{
3393 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003394 if (arg == NULL) {
3395 Py_DECREF(*(PyObject**)addr);
3396 return 1;
3397 }
3398 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003399 if (PyUnicode_READY(arg))
3400 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003401 output = arg;
3402 Py_INCREF(output);
3403 }
3404 else {
3405 arg = PyBytes_FromObject(arg);
3406 if (!arg)
3407 return 0;
3408 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3409 PyBytes_GET_SIZE(arg));
3410 Py_DECREF(arg);
3411 if (!output)
3412 return 0;
3413 if (!PyUnicode_Check(output)) {
3414 Py_DECREF(output);
3415 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3416 return 0;
3417 }
3418 }
Victor Stinner065836e2011-10-27 01:56:33 +02003419 if (PyUnicode_READY(output) < 0) {
3420 Py_DECREF(output);
3421 return 0;
3422 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003423 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003424 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003425 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3426 Py_DECREF(output);
3427 return 0;
3428 }
3429 *(PyObject**)addr = output;
3430 return Py_CLEANUP_SUPPORTED;
3431}
3432
3433
Martin v. Löwis5b222132007-06-10 09:51:05 +00003434char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003435PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003436{
Christian Heimesf3863112007-11-22 07:46:41 +00003437 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003438
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003439 if (!PyUnicode_Check(unicode)) {
3440 PyErr_BadArgument();
3441 return NULL;
3442 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003443 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003444 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003445
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003446 if (PyUnicode_UTF8(unicode) == NULL) {
3447 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003448 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3449 if (bytes == NULL)
3450 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003451 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3452 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003453 Py_DECREF(bytes);
3454 return NULL;
3455 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003456 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3457 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3458 PyBytes_AS_STRING(bytes),
3459 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003460 Py_DECREF(bytes);
3461 }
3462
3463 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003464 *psize = PyUnicode_UTF8_LENGTH(unicode);
3465 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003466}
3467
3468char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003469PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003470{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003471 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3472}
3473
3474#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003475static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003476#endif
3477
3478
3479Py_UNICODE *
3480PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3481{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003482 const unsigned char *one_byte;
3483#if SIZEOF_WCHAR_T == 4
3484 const Py_UCS2 *two_bytes;
3485#else
3486 const Py_UCS4 *four_bytes;
3487 const Py_UCS4 *ucs4_end;
3488 Py_ssize_t num_surrogates;
3489#endif
3490 wchar_t *w;
3491 wchar_t *wchar_end;
3492
3493 if (!PyUnicode_Check(unicode)) {
3494 PyErr_BadArgument();
3495 return NULL;
3496 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003497 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003498 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003499 assert(_PyUnicode_KIND(unicode) != 0);
3500 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003501
3502#ifdef Py_DEBUG
3503 ++unicode_as_unicode_calls;
3504#endif
3505
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003506 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003507#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003508 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3509 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003510 num_surrogates = 0;
3511
3512 for (; four_bytes < ucs4_end; ++four_bytes) {
3513 if (*four_bytes > 0xFFFF)
3514 ++num_surrogates;
3515 }
3516
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003517 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3518 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3519 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003520 PyErr_NoMemory();
3521 return NULL;
3522 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003523 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003524
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003525 w = _PyUnicode_WSTR(unicode);
3526 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3527 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003528 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3529 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003530 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003531 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003532 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3533 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003534 }
3535 else
3536 *w = *four_bytes;
3537
3538 if (w > wchar_end) {
3539 assert(0 && "Miscalculated string end");
3540 }
3541 }
3542 *w = 0;
3543#else
3544 /* sizeof(wchar_t) == 4 */
3545 Py_FatalError("Impossible unicode object state, wstr and str "
3546 "should share memory already.");
3547 return NULL;
3548#endif
3549 }
3550 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003551 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3552 (_PyUnicode_LENGTH(unicode) + 1));
3553 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003554 PyErr_NoMemory();
3555 return NULL;
3556 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003557 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3558 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3559 w = _PyUnicode_WSTR(unicode);
3560 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003561
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003562 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3563 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003564 for (; w < wchar_end; ++one_byte, ++w)
3565 *w = *one_byte;
3566 /* null-terminate the wstr */
3567 *w = 0;
3568 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003569 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003570#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003571 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003572 for (; w < wchar_end; ++two_bytes, ++w)
3573 *w = *two_bytes;
3574 /* null-terminate the wstr */
3575 *w = 0;
3576#else
3577 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003578 PyObject_FREE(_PyUnicode_WSTR(unicode));
3579 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003580 Py_FatalError("Impossible unicode object state, wstr "
3581 "and str should share memory already.");
3582 return NULL;
3583#endif
3584 }
3585 else {
3586 assert(0 && "This should never happen.");
3587 }
3588 }
3589 }
3590 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003591 *size = PyUnicode_WSTR_LENGTH(unicode);
3592 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003593}
3594
Alexander Belopolsky40018472011-02-26 01:02:56 +00003595Py_UNICODE *
3596PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003598 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003599}
3600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003601
Alexander Belopolsky40018472011-02-26 01:02:56 +00003602Py_ssize_t
3603PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604{
3605 if (!PyUnicode_Check(unicode)) {
3606 PyErr_BadArgument();
3607 goto onError;
3608 }
3609 return PyUnicode_GET_SIZE(unicode);
3610
Benjamin Peterson29060642009-01-31 22:14:21 +00003611 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003612 return -1;
3613}
3614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003615Py_ssize_t
3616PyUnicode_GetLength(PyObject *unicode)
3617{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003618 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003619 PyErr_BadArgument();
3620 return -1;
3621 }
3622
3623 return PyUnicode_GET_LENGTH(unicode);
3624}
3625
3626Py_UCS4
3627PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3628{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003629 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3630 PyErr_BadArgument();
3631 return (Py_UCS4)-1;
3632 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003633 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003634 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003635 return (Py_UCS4)-1;
3636 }
3637 return PyUnicode_READ_CHAR(unicode, index);
3638}
3639
3640int
3641PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3642{
3643 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003644 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003645 return -1;
3646 }
Victor Stinner488fa492011-12-12 00:01:39 +01003647 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003648 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003649 PyErr_SetString(PyExc_IndexError, "string index out of range");
3650 return -1;
3651 }
Victor Stinner488fa492011-12-12 00:01:39 +01003652 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003653 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003654 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3655 index, ch);
3656 return 0;
3657}
3658
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3664
Victor Stinner554f3f02010-06-16 23:33:54 +00003665/* create or adjust a UnicodeDecodeError */
3666static void
3667make_decode_exception(PyObject **exceptionObject,
3668 const char *encoding,
3669 const char *input, Py_ssize_t length,
3670 Py_ssize_t startpos, Py_ssize_t endpos,
3671 const char *reason)
3672{
3673 if (*exceptionObject == NULL) {
3674 *exceptionObject = PyUnicodeDecodeError_Create(
3675 encoding, input, length, startpos, endpos, reason);
3676 }
3677 else {
3678 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3679 goto onError;
3680 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3681 goto onError;
3682 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3683 goto onError;
3684 }
3685 return;
3686
3687onError:
3688 Py_DECREF(*exceptionObject);
3689 *exceptionObject = NULL;
3690}
3691
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003692/* error handling callback helper:
3693 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003694 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003695 and adjust various state variables.
3696 return 0 on success, -1 on error
3697*/
3698
Alexander Belopolsky40018472011-02-26 01:02:56 +00003699static int
3700unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003701 const char *encoding, const char *reason,
3702 const char **input, const char **inend, Py_ssize_t *startinpos,
3703 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003704 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003705{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003706 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003707
3708 PyObject *restuple = NULL;
3709 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003710 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003711 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003712 Py_ssize_t requiredsize;
3713 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003714 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003715 int res = -1;
3716
Victor Stinner596a6c42011-11-09 00:02:18 +01003717 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3718 outsize = PyUnicode_GET_LENGTH(*output);
3719 else
3720 outsize = _PyUnicode_WSTR_LENGTH(*output);
3721
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003723 *errorHandler = PyCodec_LookupError(errors);
3724 if (*errorHandler == NULL)
3725 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 }
3727
Victor Stinner554f3f02010-06-16 23:33:54 +00003728 make_decode_exception(exceptionObject,
3729 encoding,
3730 *input, *inend - *input,
3731 *startinpos, *endinpos,
3732 reason);
3733 if (*exceptionObject == NULL)
3734 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003735
3736 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3737 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003738 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003739 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003740 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003741 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003742 }
3743 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003744 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003745 if (PyUnicode_READY(repunicode) < 0)
3746 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003747
3748 /* Copy back the bytes variables, which might have been modified by the
3749 callback */
3750 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3751 if (!inputobj)
3752 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003753 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003754 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003755 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003756 *input = PyBytes_AS_STRING(inputobj);
3757 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003758 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003759 /* we can DECREF safely, as the exception has another reference,
3760 so the object won't go away. */
3761 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003762
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003763 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003764 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003765 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003766 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3767 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003768 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003769
Victor Stinner596a6c42011-11-09 00:02:18 +01003770 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3771 /* need more space? (at least enough for what we
3772 have+the replacement+the rest of the string (starting
3773 at the new input position), so we won't have to check space
3774 when there are no errors in the rest of the string) */
3775 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3776 requiredsize = *outpos + replen + insize-newpos;
3777 if (requiredsize > outsize) {
3778 if (requiredsize<2*outsize)
3779 requiredsize = 2*outsize;
3780 if (unicode_resize(output, requiredsize) < 0)
3781 goto onError;
3782 }
3783 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003784 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003785 copy_characters(*output, *outpos, repunicode, 0, replen);
3786 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003788 else {
3789 wchar_t *repwstr;
3790 Py_ssize_t repwlen;
3791 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3792 if (repwstr == NULL)
3793 goto onError;
3794 /* need more space? (at least enough for what we
3795 have+the replacement+the rest of the string (starting
3796 at the new input position), so we won't have to check space
3797 when there are no errors in the rest of the string) */
3798 requiredsize = *outpos + repwlen + insize-newpos;
3799 if (requiredsize > outsize) {
3800 if (requiredsize < 2*outsize)
3801 requiredsize = 2*outsize;
3802 if (unicode_resize(output, requiredsize) < 0)
3803 goto onError;
3804 }
3805 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3806 *outpos += repwlen;
3807 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003808 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003809 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003810
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003811 /* we made it! */
3812 res = 0;
3813
Benjamin Peterson29060642009-01-31 22:14:21 +00003814 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003815 Py_XDECREF(restuple);
3816 return res;
3817}
3818
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003819/* --- UTF-7 Codec -------------------------------------------------------- */
3820
Antoine Pitrou244651a2009-05-04 18:56:13 +00003821/* See RFC2152 for details. We encode conservatively and decode liberally. */
3822
/* Helper macros for the UTF-7 codec.  Every macro may evaluate its
   argument more than once, so pass side-effect-free expressions only. */

/* IS_BASE64: true if c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+' and '/'). */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z'))

/* FROM_BASE64: the 6-bit value of c, which must already be known to be
   a base-64 character: A-Z -> 0..25, a-z -> 26..51, 0-9 -> 52..61,
   '+' -> 62 and anything else ('/') -> 63. */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) == '+') ? 62 : 63)

/* TO_BASE64: the base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)                                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"                       \
     "abcdefghijklmnopqrstuvwxyz"                       \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  Decoding is permissive: every ASCII byte decodes
 * to itself except '+', which begins a base64 string. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
3854/* The UTF-7 encoder treats ASCII characters differently according to
3855 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
3856 * the above). See RFC2152. This array identifies these different
3857 * sets:
3858 * 0 : "Set D"
3859 * alphanumeric and '(),-./:?
3860 * 1 : "Set O"
3861 * !"#$%&*;<=>@[]^_`{|}
3862 * 2 : "whitespace"
3863 * ht nl cr sp
3864 * 3 : special (must be base64 encoded)
3865 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
3866 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003867
Tim Petersced69f82003-09-16 20:30:58 +00003868static
Antoine Pitrou244651a2009-05-04 18:56:13 +00003869char utf7_category[128] = {
3870/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
3871 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
3872/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
3873 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3874/* sp ! " # $ % & ' ( ) * + , - . / */
3875 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
3876/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
3877 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
3878/* @ A B C D E F G H I J K L M N O */
3879 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3880/* P Q R S T U V W X Y Z [ \ ] ^ _ */
3881 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
3882/* ` a b c d e f g h i j k l m n o */
3883 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3884/* p q r s t u v w x y z { | } ~ del */
3885 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003886};
3887
Antoine Pitrou244651a2009-05-04 18:56:13 +00003888/* ENCODE_DIRECT: this character should be encoded as itself. The
3889 * answer depends on whether we are encoding set O as itself, and also
3890 * on whether we are encoding whitespace as itself. RFC2152 makes it
3891 * clear that the answers to these questions vary between
3892 * applications, so this code needs to be flexible. */
Marc-André Lemburge115ec82005-10-19 22:33:31 +00003893
Antoine Pitrou244651a2009-05-04 18:56:13 +00003894#define ENCODE_DIRECT(c, directO, directWS) \
3895 ((c) < 128 && (c) > 0 && \
3896 ((utf7_category[(c)] == 0) || \
3897 (directWS && (utf7_category[(c)] == 2)) || \
3898 (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003899
Alexander Belopolsky40018472011-02-26 01:02:56 +00003900PyObject *
3901PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003902 Py_ssize_t size,
3903 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003904{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003905 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3906}
3907
Antoine Pitrou244651a2009-05-04 18:56:13 +00003908/* The decoder. The only state we preserve is our read position,
3909 * i.e. how many characters we have consumed. So if we end in the
3910 * middle of a shift sequence we have to back off the read position
3911 * and the output to the beginning of the sequence, otherwise we lose
3912 * all the shift state (seen bits, number of bits seen, high
3913 * surrogate). */
3914
Alexander Belopolsky40018472011-02-26 01:02:56 +00003915PyObject *
3916PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003917 Py_ssize_t size,
3918 const char *errors,
3919 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003920{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003921 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003922 Py_ssize_t startinpos;
3923 Py_ssize_t endinpos;
3924 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003925 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003926 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003927 const char *errmsg = "";
3928 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003929 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003930 unsigned int base64bits = 0;
3931 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01003932 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003933 PyObject *errorHandler = NULL;
3934 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003935
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003936 /* Start off assuming it's all ASCII. Widen later as necessary. */
3937 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003938 if (!unicode)
3939 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003940 if (size == 0) {
3941 if (consumed)
3942 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003943 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003944 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003945
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003946 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003947 e = s + size;
3948
3949 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003950 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003951 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003952 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003953
Antoine Pitrou244651a2009-05-04 18:56:13 +00003954 if (inShift) { /* in a base-64 section */
3955 if (IS_BASE64(ch)) { /* consume a base-64 character */
3956 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3957 base64bits += 6;
3958 s++;
3959 if (base64bits >= 16) {
3960 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01003961 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00003962 base64bits -= 16;
3963 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3964 if (surrogate) {
3965 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01003966 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
3967 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003968 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3969 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003970 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003971 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003972 }
3973 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003974 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3975 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003976 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003977 }
3978 }
Victor Stinner551ac952011-11-29 22:58:13 +01003979 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003980 /* first surrogate */
3981 surrogate = outCh;
3982 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003983 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003984 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3985 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003986 }
3987 }
3988 }
3989 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003990 inShift = 0;
3991 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003992 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003993 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3994 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003995 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003996 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003997 if (base64bits > 0) { /* left-over bits */
3998 if (base64bits >= 6) {
3999 /* We've seen at least one base-64 character */
4000 errmsg = "partial character in shift sequence";
4001 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004002 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004003 else {
4004 /* Some bits remain; they should be zero */
4005 if (base64buffer != 0) {
4006 errmsg = "non-zero padding bits in shift sequence";
4007 goto utf7Error;
4008 }
4009 }
4010 }
4011 if (ch != '-') {
4012 /* '-' is absorbed; other terminating
4013 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004014 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4015 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004016 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004017 }
4018 }
4019 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004020 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004021 s++; /* consume '+' */
4022 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004023 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004024 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4025 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004026 }
4027 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004028 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004029 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004030 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004031 }
4032 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004033 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004034 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4035 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004036 s++;
4037 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004038 else {
4039 startinpos = s-starts;
4040 s++;
4041 errmsg = "unexpected special character";
4042 goto utf7Error;
4043 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004044 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004045utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004046 endinpos = s-starts;
4047 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004048 errors, &errorHandler,
4049 "utf7", errmsg,
4050 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004051 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004052 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004053 }
4054
Antoine Pitrou244651a2009-05-04 18:56:13 +00004055 /* end of string */
4056
4057 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4058 /* if we're in an inconsistent state, that's an error */
4059 if (surrogate ||
4060 (base64bits >= 6) ||
4061 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004062 endinpos = size;
4063 if (unicode_decode_call_errorhandler(
4064 errors, &errorHandler,
4065 "utf7", "unterminated shift sequence",
4066 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004067 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004068 goto onError;
4069 if (s < e)
4070 goto restart;
4071 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004072 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004073
4074 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004075 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004076 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004077 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004078 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004079 }
4080 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004081 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004082 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004083 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004084
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004085 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004086 goto onError;
4087
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004088 Py_XDECREF(errorHandler);
4089 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004090 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004091
Benjamin Peterson29060642009-01-31 22:14:21 +00004092 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004093 Py_XDECREF(errorHandler);
4094 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004095 Py_DECREF(unicode);
4096 return NULL;
4097}
4098
4099
Alexander Belopolsky40018472011-02-26 01:02:56 +00004100PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004101_PyUnicode_EncodeUTF7(PyObject *str,
4102 int base64SetO,
4103 int base64WhiteSpace,
4104 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004105{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004106 int kind;
4107 void *data;
4108 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004109 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004110 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004111 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004112 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004113 unsigned int base64bits = 0;
4114 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004115 char * out;
4116 char * start;
4117
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004118 if (PyUnicode_READY(str) < 0)
4119 return NULL;
4120 kind = PyUnicode_KIND(str);
4121 data = PyUnicode_DATA(str);
4122 len = PyUnicode_GET_LENGTH(str);
4123
4124 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004125 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004126
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004127 /* It might be possible to tighten this worst case */
4128 allocated = 8 * len;
4129 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004130 return PyErr_NoMemory();
4131
Antoine Pitrou244651a2009-05-04 18:56:13 +00004132 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004133 if (v == NULL)
4134 return NULL;
4135
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004136 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004137 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004138 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004139
Antoine Pitrou244651a2009-05-04 18:56:13 +00004140 if (inShift) {
4141 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4142 /* shifting out */
4143 if (base64bits) { /* output remaining bits */
4144 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4145 base64buffer = 0;
4146 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004147 }
4148 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004149 /* Characters not in the BASE64 set implicitly unshift the sequence
4150 so no '-' is required, except if the character is itself a '-' */
4151 if (IS_BASE64(ch) || ch == '-') {
4152 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004153 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004154 *out++ = (char) ch;
4155 }
4156 else {
4157 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004158 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004159 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004160 else { /* not in a shift sequence */
4161 if (ch == '+') {
4162 *out++ = '+';
4163 *out++ = '-';
4164 }
4165 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4166 *out++ = (char) ch;
4167 }
4168 else {
4169 *out++ = '+';
4170 inShift = 1;
4171 goto encode_char;
4172 }
4173 }
4174 continue;
4175encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004176 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004177 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004178
Antoine Pitrou244651a2009-05-04 18:56:13 +00004179 /* code first surrogate */
4180 base64bits += 16;
4181 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4182 while (base64bits >= 6) {
4183 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4184 base64bits -= 6;
4185 }
4186 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004187 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004188 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004189 base64bits += 16;
4190 base64buffer = (base64buffer << 16) | ch;
4191 while (base64bits >= 6) {
4192 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4193 base64bits -= 6;
4194 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004195 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004196 if (base64bits)
4197 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4198 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004199 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004200 if (_PyBytes_Resize(&v, out - start) < 0)
4201 return NULL;
4202 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004203}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004204PyObject *
4205PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4206 Py_ssize_t size,
4207 int base64SetO,
4208 int base64WhiteSpace,
4209 const char *errors)
4210{
4211 PyObject *result;
4212 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4213 if (tmp == NULL)
4214 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004215 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004216 base64WhiteSpace, errors);
4217 Py_DECREF(tmp);
4218 return result;
4219}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004220
Antoine Pitrou244651a2009-05-04 18:56:13 +00004221#undef IS_BASE64
4222#undef FROM_BASE64
4223#undef TO_BASE64
4224#undef DECODE_DIRECT
4225#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004226
Guido van Rossumd57fd912000-03-10 22:53:23 +00004227/* --- UTF-8 Codec -------------------------------------------------------- */
4228
/* Read-only lookup table, indexed by a byte value (0..255), hence
   declared const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4250
Alexander Belopolsky40018472011-02-26 01:02:56 +00004251PyObject *
4252PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004253 Py_ssize_t size,
4254 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004255{
Walter Dörwald69652032004-09-07 20:24:22 +00004256 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4257}
4258
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004259#include "stringlib/ucs1lib.h"
4260#include "stringlib/codecs.h"
4261#include "stringlib/undef.h"
4262
4263#include "stringlib/ucs2lib.h"
4264#include "stringlib/codecs.h"
4265#include "stringlib/undef.h"
4266
4267#include "stringlib/ucs4lib.h"
4268#include "stringlib/codecs.h"
4269#include "stringlib/undef.h"
4270
Antoine Pitrouab868312009-01-10 15:40:25 +00004271/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4272#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4273
4274/* Mask to quickly check whether a C 'long' contains a
4275 non-ASCII, UTF8-encoded char. */
4276#if (SIZEOF_LONG == 8)
4277# define ASCII_CHAR_MASK 0x8080808080808080L
4278#elif (SIZEOF_LONG == 4)
4279# define ASCII_CHAR_MASK 0x80808080L
4280#else
4281# error C 'long' size should be either 4 or 8!
4282#endif
4283
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004284/* Scans a UTF-8 string and returns the maximum character to be expected
4285 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004286
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004287 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004288 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004289 */
4290static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004291utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004292{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004293 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004294 const unsigned char *end = p + string_size;
4295 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004296
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004297 assert(unicode_size != NULL);
4298
4299 /* By having a cascade of independent loops which fallback onto each
4300 other, we minimize the amount of work done in the average loop
4301 iteration, and we also maximize the CPU's ability to predict
4302 branches correctly (because a given condition will have always the
4303 same boolean outcome except perhaps in the last iteration of the
4304 corresponding loop).
4305 In the general case this brings us rather close to decoding
4306 performance pre-PEP 393, despite the two-pass decoding.
4307
4308 Note that the pure ASCII loop is not duplicated once a non-ASCII
4309 character has been encountered. It is actually a pessimization (by
4310 a significant factor) to use this loop on text with many non-ASCII
4311 characters, and it is important to avoid bad performance on valid
4312 utf-8 data (invalid utf-8 being a different can of worms).
4313 */
4314
4315 /* ASCII */
4316 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004317 /* Only check value if it's not a ASCII char... */
4318 if (*p < 0x80) {
4319 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4320 an explanation. */
4321 if (!((size_t) p & LONG_PTR_MASK)) {
4322 /* Help register allocation */
4323 register const unsigned char *_p = p;
4324 while (_p < aligned_end) {
4325 unsigned long value = *(unsigned long *) _p;
4326 if (value & ASCII_CHAR_MASK)
4327 break;
4328 _p += SIZEOF_LONG;
4329 char_count += SIZEOF_LONG;
4330 }
4331 p = _p;
4332 if (p == end)
4333 break;
4334 }
4335 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004336 if (*p < 0x80)
4337 ++char_count;
4338 else
4339 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004340 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004341 *unicode_size = char_count;
4342 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004343
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004344_ucs1loop:
4345 for (; p < end; ++p) {
4346 if (*p < 0xc4)
4347 char_count += ((*p & 0xc0) != 0x80);
4348 else
4349 goto _ucs2loop;
4350 }
4351 *unicode_size = char_count;
4352 return 255;
4353
4354_ucs2loop:
4355 for (; p < end; ++p) {
4356 if (*p < 0xf0)
4357 char_count += ((*p & 0xc0) != 0x80);
4358 else
4359 goto _ucs4loop;
4360 }
4361 *unicode_size = char_count;
4362 return 65535;
4363
4364_ucs4loop:
4365 for (; p < end; ++p) {
4366 char_count += ((*p & 0xc0) != 0x80);
4367 }
4368 *unicode_size = char_count;
4369 return 65537;
4370}
4371
/* Store `value` at position `index` of the string being built.  Similar to
   PyUnicode_WRITE, but for use once decoding errors may have made the
   pre-computed length or maximum character wrong: the string is grown
   with unicode_resize() when `index` lies beyond its current length, and
   the store itself goes through unicode_putchar().

   Implicit parameters: the variable `unicode` (passed by address to both
   helpers, so it may be replaced) and the label `onError`, which is jumped
   to when either helper fails.

   Growth over-allocates (index + index/8), so the result needs to be
   shrunk to its final length at the end.

   Both arguments are evaluated exactly once.  The local has a name that
   is unlikely to clash with identifiers appearing in `value`. */
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        Py_ssize_t wmf_pos_ = (index);                                  \
        if (wmf_pos_ > PyUnicode_GET_LENGTH(unicode) &&                 \
            unicode_resize(&unicode, wmf_pos_ + wmf_pos_/8) < 0)        \
            goto onError;                                               \
        if (unicode_putchar(&unicode, &wmf_pos_, (value)) < 0)          \
            goto onError;                                               \
    } while (0)
4385
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004386static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004387decode_utf8_errors(const char *starts,
4388 Py_ssize_t size,
4389 const char *errors,
4390 Py_ssize_t *consumed,
4391 const char *s,
4392 PyObject *unicode,
4393 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004394{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004395 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004396 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004397 Py_ssize_t startinpos;
4398 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004399 const char *e = starts + size;
4400 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004401 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402 PyObject *errorHandler = NULL;
4403 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004404
Antoine Pitrouab868312009-01-10 15:40:25 +00004405 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004406
4407 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004408 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004409
4410 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004411 /* Fast path for runs of ASCII characters. Given that common UTF-8
4412 input will consist of an overwhelming majority of ASCII
4413 characters, we try to optimize for this case by checking
4414 as many characters as a C 'long' can contain.
4415 First, check if we can do an aligned read, as most CPUs have
4416 a penalty for unaligned reads.
4417 */
4418 if (!((size_t) s & LONG_PTR_MASK)) {
4419 /* Help register allocation */
4420 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004421 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004422 while (_s < aligned_end) {
4423 /* Read a whole long at a time (either 4 or 8 bytes),
4424 and do a fast unrolled copy if it only contains ASCII
4425 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004426 unsigned long value = *(unsigned long *) _s;
4427 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004428 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004429 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4430 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4431 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4432 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004433#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004434 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4435 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4436 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4437 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004438#endif
4439 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004440 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004441 }
4442 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004443 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004444 if (s == e)
4445 break;
4446 ch = (unsigned char)*s;
4447 }
4448 }
4449
4450 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004451 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452 s++;
4453 continue;
4454 }
4455
4456 n = utf8_code_length[ch];
4457
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004458 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004459 if (consumed)
4460 break;
4461 else {
4462 errmsg = "unexpected end of data";
4463 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004464 endinpos = startinpos+1;
4465 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4466 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004467 goto utf8Error;
4468 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004469 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470
4471 switch (n) {
4472
4473 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004474 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004475 startinpos = s-starts;
4476 endinpos = startinpos+1;
4477 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478
4479 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004480 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 startinpos = s-starts;
4482 endinpos = startinpos+1;
4483 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484
4485 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004486 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004487 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004488 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004489 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004490 goto utf8Error;
4491 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004492 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004493 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004494 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495 break;
4496
4497 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004498 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4499 will result in surrogates in range d800-dfff. Surrogates are
4500 not valid UTF-8 so they are rejected.
4501 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4502 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004503 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004504 (s[2] & 0xc0) != 0x80 ||
4505 ((unsigned char)s[0] == 0xE0 &&
4506 (unsigned char)s[1] < 0xA0) ||
4507 ((unsigned char)s[0] == 0xED &&
4508 (unsigned char)s[1] > 0x9F)) {
4509 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004510 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004511 endinpos = startinpos + 1;
4512
4513 /* if s[1] first two bits are 1 and 0, then the invalid
4514 continuation byte is s[2], so increment endinpos by 1,
4515 if not, s[1] is invalid and endinpos doesn't need to
4516 be incremented. */
4517 if ((s[1] & 0xC0) == 0x80)
4518 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004519 goto utf8Error;
4520 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004522 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004523 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004524 break;
4525
4526 case 4:
4527 if ((s[1] & 0xc0) != 0x80 ||
4528 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004529 (s[3] & 0xc0) != 0x80 ||
4530 ((unsigned char)s[0] == 0xF0 &&
4531 (unsigned char)s[1] < 0x90) ||
4532 ((unsigned char)s[0] == 0xF4 &&
4533 (unsigned char)s[1] > 0x8F)) {
4534 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004535 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004536 endinpos = startinpos + 1;
4537 if ((s[1] & 0xC0) == 0x80) {
4538 endinpos++;
4539 if ((s[2] & 0xC0) == 0x80)
4540 endinpos++;
4541 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004542 goto utf8Error;
4543 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004544 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004545 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004546 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004547
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004548 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004549 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550 }
4551 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004552 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004553
Benjamin Peterson29060642009-01-31 22:14:21 +00004554 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004555 if (unicode_decode_call_errorhandler(
4556 errors, &errorHandler,
4557 "utf8", errmsg,
4558 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004559 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004560 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004561 /* Update data because unicode_decode_call_errorhandler might have
4562 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004563 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004564 }
Walter Dörwald69652032004-09-07 20:24:22 +00004565 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004566 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004568 /* Adjust length and ready string when it contained errors and
4569 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004570 if (unicode_resize(&unicode, i) < 0)
4571 goto onError;
4572 unicode_adjust_maxchar(&unicode);
4573 if (unicode == NULL)
4574 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004575
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 Py_XDECREF(errorHandler);
4577 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004578 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004579 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004580
Benjamin Peterson29060642009-01-31 22:14:21 +00004581 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004582 Py_XDECREF(errorHandler);
4583 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004584 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004585 return NULL;
4586}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004587#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004588
Victor Stinner785938e2011-12-11 20:09:03 +01004589PyObject *
4590PyUnicode_DecodeUTF8Stateful(const char *s,
4591 Py_ssize_t size,
4592 const char *errors,
4593 Py_ssize_t *consumed)
4594{
4595 Py_UCS4 maxchar = 0;
4596 Py_ssize_t unicode_size;
4597 int has_errors = 0;
4598 PyObject *unicode;
4599 int kind;
4600 void *data;
4601 const char *starts = s;
4602 const char *e;
4603 Py_ssize_t i;
4604
4605 if (size == 0) {
4606 if (consumed)
4607 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004608 Py_INCREF(unicode_empty);
4609 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004610 }
4611
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004612 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004613
4614 /* When the string is ASCII only, just use memcpy and return.
4615 unicode_size may be != size if there is an incomplete UTF-8
4616 sequence at the end of the ASCII block. */
4617 if (maxchar < 128 && size == unicode_size) {
4618 if (consumed)
4619 *consumed = size;
4620 return unicode_fromascii(s, size);
4621 }
4622
4623 unicode = PyUnicode_New(unicode_size, maxchar);
4624 if (!unicode)
4625 return NULL;
4626 kind = PyUnicode_KIND(unicode);
4627 data = PyUnicode_DATA(unicode);
4628
4629 /* Unpack UTF-8 encoded data */
4630 i = 0;
4631 e = starts + size;
4632 switch (kind) {
4633 case PyUnicode_1BYTE_KIND:
4634 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4635 break;
4636 case PyUnicode_2BYTE_KIND:
4637 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4638 break;
4639 case PyUnicode_4BYTE_KIND:
4640 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4641 break;
4642 }
4643 if (!has_errors) {
4644 /* Ensure the unicode size calculation was correct */
4645 assert(i == unicode_size);
4646 assert(s == e);
4647 if (consumed)
4648 *consumed = size;
4649 return unicode;
4650 }
4651
4652 /* In case of errors, maxchar and size computation might be incorrect;
4653 code below refits and resizes as necessary. */
4654 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4655}
4656
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   s    -- UTF-8 encoded input
   size -- number of bytes at s

   Every byte that is not part of a valid sequence is stored as the lone
   surrogate 0xDC00 + byte.  Returns a NUL-terminated wchar_t buffer
   obtained from PyMem_Malloc (ownership passes to the caller), or NULL
   if the buffer cannot be allocated. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: a sequence never decodes to more wchar_t units than it has
       bytes, so size + 1 units always suffice (the extra one is for the
       terminator).
       Check that (size + 1) * sizeof(wchar_t) is representable without
       computing size + 1, which would overflow for PY_SSIZE_T_MAX.  The
       unsigned conversion also rejects negative sizes. */
    if ((size_t)size > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        /* Truncated sequence.  Lengths are compared instead of computing
           s + n, which could point beyond one past the end of the
           buffer. */
        if (n > e - s) {
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */
            *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* Every jump here happens before ch is reassigned, so ch is still
           the raw byte at s; escape that single byte and resume with the
           next one. */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004765/* Primary internal function which creates utf8 encoded bytes objects.
4766
4767 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004768 and allocate exactly as much space needed at the end. Else allocate the
4769 maximum possible needed (4 result bytes per Unicode character), and return
4770 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004771*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004772PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004773_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774{
Tim Peters602f7402002-04-27 18:03:26 +00004775#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004776
Guido van Rossum98297ee2007-11-06 21:34:58 +00004777 Py_ssize_t i; /* index into s of next input byte */
4778 PyObject *result; /* result string object */
4779 char *p; /* next free byte in output buffer */
4780 Py_ssize_t nallocated; /* number of result bytes allocated */
4781 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004782 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004783 PyObject *errorHandler = NULL;
4784 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004785 int kind;
4786 void *data;
4787 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004788 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004790 if (!PyUnicode_Check(unicode)) {
4791 PyErr_BadArgument();
4792 return NULL;
4793 }
4794
4795 if (PyUnicode_READY(unicode) == -1)
4796 return NULL;
4797
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004798 if (PyUnicode_UTF8(unicode))
4799 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4800 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004801
4802 kind = PyUnicode_KIND(unicode);
4803 data = PyUnicode_DATA(unicode);
4804 size = PyUnicode_GET_LENGTH(unicode);
4805
Tim Peters602f7402002-04-27 18:03:26 +00004806 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807
Tim Peters602f7402002-04-27 18:03:26 +00004808 if (size <= MAX_SHORT_UNICHARS) {
4809 /* Write into the stack buffer; nallocated can't overflow.
4810 * At the end, we'll allocate exactly as much heap space as it
4811 * turns out we need.
4812 */
4813 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004814 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004815 p = stackbuf;
4816 }
4817 else {
4818 /* Overallocate on the heap, and give the excess back at the end. */
4819 nallocated = size * 4;
4820 if (nallocated / 4 != size) /* overflow! */
4821 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004822 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004823 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004824 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004825 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004826 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004827
Tim Peters602f7402002-04-27 18:03:26 +00004828 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004829 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004830
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004831 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004832 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004834
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004836 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004837 *p++ = (char)(0xc0 | (ch >> 6));
4838 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner551ac952011-11-29 22:58:13 +01004839 } else if (Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004840 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004841 Py_ssize_t repsize, k, startpos;
4842 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004843 rep = unicode_encode_call_errorhandler(
4844 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004845 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004846 if (!rep)
4847 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004849 if (PyBytes_Check(rep))
4850 repsize = PyBytes_GET_SIZE(rep);
4851 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004852 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004853
4854 if (repsize > 4) {
4855 Py_ssize_t offset;
4856
4857 if (result == NULL)
4858 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004859 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004860 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004862 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4863 /* integer overflow */
4864 PyErr_NoMemory();
4865 goto error;
4866 }
4867 nallocated += repsize - 4;
4868 if (result != NULL) {
4869 if (_PyBytes_Resize(&result, nallocated) < 0)
4870 goto error;
4871 } else {
4872 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004873 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004874 goto error;
4875 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4876 }
4877 p = PyBytes_AS_STRING(result) + offset;
4878 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004880 if (PyBytes_Check(rep)) {
4881 char *prep = PyBytes_AS_STRING(rep);
4882 for(k = repsize; k > 0; k--)
4883 *p++ = *prep++;
4884 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004885 enum PyUnicode_Kind repkind;
4886 void *repdata;
4887
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004888 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004889 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004890 repkind = PyUnicode_KIND(rep);
4891 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004892
4893 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004894 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004895 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004896 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004897 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004898 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004899 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004900 goto error;
4901 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004902 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004903 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004904 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004905 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004906 } else if (ch < 0x10000) {
4907 *p++ = (char)(0xe0 | (ch >> 12));
4908 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4909 *p++ = (char)(0x80 | (ch & 0x3f));
4910 } else /* ch >= 0x10000 */ {
Victor Stinner8faf8212011-12-08 22:14:11 +01004911 assert(ch <= MAX_UNICODE);
Tim Peters602f7402002-04-27 18:03:26 +00004912 /* Encode UCS4 Unicode ordinals */
4913 *p++ = (char)(0xf0 | (ch >> 18));
4914 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4915 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4916 *p++ = (char)(0x80 | (ch & 0x3f));
4917 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004919
Guido van Rossum98297ee2007-11-06 21:34:58 +00004920 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004921 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004922 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004923 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004924 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004925 }
4926 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004927 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004928 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004929 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004930 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004931 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004932
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004933 Py_XDECREF(errorHandler);
4934 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004935 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004936 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004937 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004938 Py_XDECREF(errorHandler);
4939 Py_XDECREF(exc);
4940 Py_XDECREF(result);
4941 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004942
Tim Peters602f7402002-04-27 18:03:26 +00004943#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944}
4945
Alexander Belopolsky40018472011-02-26 01:02:56 +00004946PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004947PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4948 Py_ssize_t size,
4949 const char *errors)
4950{
4951 PyObject *v, *unicode;
4952
4953 unicode = PyUnicode_FromUnicode(s, size);
4954 if (unicode == NULL)
4955 return NULL;
4956 v = _PyUnicode_AsUTF8String(unicode, errors);
4957 Py_DECREF(unicode);
4958 return v;
4959}
4960
4961PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004962PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004964 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965}
4966
Walter Dörwald41980ca2007-08-16 21:55:45 +00004967/* --- UTF-32 Codec ------------------------------------------------------- */
4968
4969PyObject *
4970PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004971 Py_ssize_t size,
4972 const char *errors,
4973 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004974{
4975 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4976}
4977
4978PyObject *
4979PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004980 Py_ssize_t size,
4981 const char *errors,
4982 int *byteorder,
4983 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004984{
4985 const char *starts = s;
4986 Py_ssize_t startinpos;
4987 Py_ssize_t endinpos;
4988 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004989 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004990 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004991 int bo = 0; /* assume native ordering by default */
4992 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004993 /* Offsets from q for retrieving bytes in the right order. */
4994#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4995 int iorder[] = {0, 1, 2, 3};
4996#else
4997 int iorder[] = {3, 2, 1, 0};
4998#endif
4999 PyObject *errorHandler = NULL;
5000 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005001
Walter Dörwald41980ca2007-08-16 21:55:45 +00005002 q = (unsigned char *)s;
5003 e = q + size;
5004
5005 if (byteorder)
5006 bo = *byteorder;
5007
5008 /* Check for BOM marks (U+FEFF) in the input and adjust current
5009 byte order setting accordingly. In native mode, the leading BOM
5010 mark is skipped, in all other modes, it is copied to the output
5011 stream as-is (giving a ZWNBSP character). */
5012 if (bo == 0) {
5013 if (size >= 4) {
5014 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005015 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005016#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005017 if (bom == 0x0000FEFF) {
5018 q += 4;
5019 bo = -1;
5020 }
5021 else if (bom == 0xFFFE0000) {
5022 q += 4;
5023 bo = 1;
5024 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005025#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005026 if (bom == 0x0000FEFF) {
5027 q += 4;
5028 bo = 1;
5029 }
5030 else if (bom == 0xFFFE0000) {
5031 q += 4;
5032 bo = -1;
5033 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005034#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005035 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005036 }
5037
5038 if (bo == -1) {
5039 /* force LE */
5040 iorder[0] = 0;
5041 iorder[1] = 1;
5042 iorder[2] = 2;
5043 iorder[3] = 3;
5044 }
5045 else if (bo == 1) {
5046 /* force BE */
5047 iorder[0] = 3;
5048 iorder[1] = 2;
5049 iorder[2] = 1;
5050 iorder[3] = 0;
5051 }
5052
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005053 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005054 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005055 if (!unicode)
5056 return NULL;
5057 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005058 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005059 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005060
Walter Dörwald41980ca2007-08-16 21:55:45 +00005061 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005062 Py_UCS4 ch;
5063 /* remaining bytes at the end? (size should be divisible by 4) */
5064 if (e-q<4) {
5065 if (consumed)
5066 break;
5067 errmsg = "truncated data";
5068 startinpos = ((const char *)q)-starts;
5069 endinpos = ((const char *)e)-starts;
5070 goto utf32Error;
5071 /* The remaining input chars are ignored if the callback
5072 chooses to skip the input */
5073 }
5074 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5075 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005076
Benjamin Peterson29060642009-01-31 22:14:21 +00005077 if (ch >= 0x110000)
5078 {
5079 errmsg = "codepoint not in range(0x110000)";
5080 startinpos = ((const char *)q)-starts;
5081 endinpos = startinpos+4;
5082 goto utf32Error;
5083 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005084 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5085 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005086 q += 4;
5087 continue;
5088 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005089 if (unicode_decode_call_errorhandler(
5090 errors, &errorHandler,
5091 "utf32", errmsg,
5092 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005093 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005094 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005095 }
5096
5097 if (byteorder)
5098 *byteorder = bo;
5099
5100 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005101 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005102
5103 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005104 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005105 goto onError;
5106
5107 Py_XDECREF(errorHandler);
5108 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005109 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005110
Benjamin Peterson29060642009-01-31 22:14:21 +00005111 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005112 Py_DECREF(unicode);
5113 Py_XDECREF(errorHandler);
5114 Py_XDECREF(exc);
5115 return NULL;
5116}
5117
5118PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005119_PyUnicode_EncodeUTF32(PyObject *str,
5120 const char *errors,
5121 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005122{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005123 int kind;
5124 void *data;
5125 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005126 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005127 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005128 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005129 /* Offsets from p for storing byte pairs in the right order. */
5130#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5131 int iorder[] = {0, 1, 2, 3};
5132#else
5133 int iorder[] = {3, 2, 1, 0};
5134#endif
5135
Benjamin Peterson29060642009-01-31 22:14:21 +00005136#define STORECHAR(CH) \
5137 do { \
5138 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5139 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5140 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5141 p[iorder[0]] = (CH) & 0xff; \
5142 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005143 } while(0)
5144
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005145 if (!PyUnicode_Check(str)) {
5146 PyErr_BadArgument();
5147 return NULL;
5148 }
5149 if (PyUnicode_READY(str) < 0)
5150 return NULL;
5151 kind = PyUnicode_KIND(str);
5152 data = PyUnicode_DATA(str);
5153 len = PyUnicode_GET_LENGTH(str);
5154
5155 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005156 bytesize = nsize * 4;
5157 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005158 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005159 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005160 if (v == NULL)
5161 return NULL;
5162
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005163 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005164 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005165 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005166 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005167 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005168
5169 if (byteorder == -1) {
5170 /* force LE */
5171 iorder[0] = 0;
5172 iorder[1] = 1;
5173 iorder[2] = 2;
5174 iorder[3] = 3;
5175 }
5176 else if (byteorder == 1) {
5177 /* force BE */
5178 iorder[0] = 3;
5179 iorder[1] = 2;
5180 iorder[2] = 1;
5181 iorder[3] = 0;
5182 }
5183
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005184 for (i = 0; i < len; i++)
5185 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005186
5187 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005188 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005189#undef STORECHAR
5190}
5191
Alexander Belopolsky40018472011-02-26 01:02:56 +00005192PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005193PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5194 Py_ssize_t size,
5195 const char *errors,
5196 int byteorder)
5197{
5198 PyObject *result;
5199 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5200 if (tmp == NULL)
5201 return NULL;
5202 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5203 Py_DECREF(tmp);
5204 return result;
5205}
5206
5207PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005208PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005209{
Victor Stinnerb960b342011-11-20 19:12:52 +01005210 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005211}
5212
/* --- UTF-16 Codec ------------------------------------------------------- */
5214
Tim Peters772747b2001-08-09 22:21:55 +00005215PyObject *
5216PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005217 Py_ssize_t size,
5218 const char *errors,
5219 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220{
Walter Dörwald69652032004-09-07 20:24:22 +00005221 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5222}
5223
Antoine Pitrouab868312009-01-10 15:40:25 +00005224/* Two masks for fast checking of whether a C 'long' may contain
5225 UTF16-encoded surrogate characters. This is an efficient heuristic,
5226 assuming that non-surrogate characters with a code point >= 0x8000 are
5227 rare in most input.
5228 FAST_CHAR_MASK is used when the input is in native byte ordering,
5229 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005230*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005231#if (SIZEOF_LONG == 8)
5232# define FAST_CHAR_MASK 0x8000800080008000L
5233# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5234#elif (SIZEOF_LONG == 4)
5235# define FAST_CHAR_MASK 0x80008000L
5236# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5237#else
5238# error C 'long' size should be either 4 or 8!
5239#endif
5240
Walter Dörwald69652032004-09-07 20:24:22 +00005241PyObject *
5242PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005243 Py_ssize_t size,
5244 const char *errors,
5245 int *byteorder,
5246 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005247{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005248 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005249 Py_ssize_t startinpos;
5250 Py_ssize_t endinpos;
5251 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005252 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005253 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005254 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005255 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005256 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005257 /* Offsets from q for retrieving byte pairs in the right order. */
5258#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5259 int ihi = 1, ilo = 0;
5260#else
5261 int ihi = 0, ilo = 1;
5262#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005263 PyObject *errorHandler = NULL;
5264 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265
5266 /* Note: size will always be longer than the resulting Unicode
5267 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005268 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269 if (!unicode)
5270 return NULL;
5271 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005272 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005273 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274
Tim Peters772747b2001-08-09 22:21:55 +00005275 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005276 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277
5278 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005279 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005281 /* Check for BOM marks (U+FEFF) in the input and adjust current
5282 byte order setting accordingly. In native mode, the leading BOM
5283 mark is skipped, in all other modes, it is copied to the output
5284 stream as-is (giving a ZWNBSP character). */
5285 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005286 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005287 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005288#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005289 if (bom == 0xFEFF) {
5290 q += 2;
5291 bo = -1;
5292 }
5293 else if (bom == 0xFFFE) {
5294 q += 2;
5295 bo = 1;
5296 }
Tim Petersced69f82003-09-16 20:30:58 +00005297#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005298 if (bom == 0xFEFF) {
5299 q += 2;
5300 bo = 1;
5301 }
5302 else if (bom == 0xFFFE) {
5303 q += 2;
5304 bo = -1;
5305 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005306#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005307 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005308 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309
Tim Peters772747b2001-08-09 22:21:55 +00005310 if (bo == -1) {
5311 /* force LE */
5312 ihi = 1;
5313 ilo = 0;
5314 }
5315 else if (bo == 1) {
5316 /* force BE */
5317 ihi = 0;
5318 ilo = 1;
5319 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005320#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5321 native_ordering = ilo < ihi;
5322#else
5323 native_ordering = ilo > ihi;
5324#endif
Tim Peters772747b2001-08-09 22:21:55 +00005325
Antoine Pitrouab868312009-01-10 15:40:25 +00005326 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005327 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005328 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005329 /* First check for possible aligned read of a C 'long'. Unaligned
5330 reads are more expensive, better to defer to another iteration. */
5331 if (!((size_t) q & LONG_PTR_MASK)) {
5332 /* Fast path for runs of non-surrogate chars. */
5333 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005334 int kind = PyUnicode_KIND(unicode);
5335 void *data = PyUnicode_DATA(unicode);
5336 while (_q < aligned_end) {
5337 unsigned long block = * (unsigned long *) _q;
5338 unsigned short *pblock = (unsigned short*)&block;
5339 Py_UCS4 maxch;
5340 if (native_ordering) {
5341 /* Can use buffer directly */
5342 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005343 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005344 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005345 else {
5346 /* Need to byte-swap */
5347 unsigned char *_p = (unsigned char*)pblock;
5348 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005349 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005350 _p[0] = _q[1];
5351 _p[1] = _q[0];
5352 _p[2] = _q[3];
5353 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005354#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005355 _p[4] = _q[5];
5356 _p[5] = _q[4];
5357 _p[6] = _q[7];
5358 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005359#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005360 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005361 maxch = Py_MAX(pblock[0], pblock[1]);
5362#if SIZEOF_LONG == 8
5363 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5364#endif
5365 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5366 if (unicode_widen(&unicode, maxch) < 0)
5367 goto onError;
5368 kind = PyUnicode_KIND(unicode);
5369 data = PyUnicode_DATA(unicode);
5370 }
5371 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5372 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5373#if SIZEOF_LONG == 8
5374 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5375 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5376#endif
5377 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005378 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005379 q = _q;
5380 if (q >= e)
5381 break;
5382 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005383 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005384
Benjamin Peterson14339b62009-01-31 16:36:08 +00005385 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005386
Victor Stinner551ac952011-11-29 22:58:13 +01005387 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005388 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5389 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005390 continue;
5391 }
5392
5393 /* UTF-16 code pair: */
5394 if (q > e) {
5395 errmsg = "unexpected end of data";
5396 startinpos = (((const char *)q) - 2) - starts;
5397 endinpos = ((const char *)e) + 1 - starts;
5398 goto utf16Error;
5399 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005400 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5401 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005402 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005403 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005404 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005405 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005406 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005407 continue;
5408 }
5409 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005410 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 startinpos = (((const char *)q)-4)-starts;
5412 endinpos = startinpos+2;
5413 goto utf16Error;
5414 }
5415
Benjamin Peterson14339b62009-01-31 16:36:08 +00005416 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005417 errmsg = "illegal encoding";
5418 startinpos = (((const char *)q)-2)-starts;
5419 endinpos = startinpos+2;
5420 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005421
Benjamin Peterson29060642009-01-31 22:14:21 +00005422 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005423 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005424 errors,
5425 &errorHandler,
5426 "utf16", errmsg,
5427 &starts,
5428 (const char **)&e,
5429 &startinpos,
5430 &endinpos,
5431 &exc,
5432 (const char **)&q,
5433 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005434 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005435 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005437 /* remaining byte at the end? (size should be even) */
5438 if (e == q) {
5439 if (!consumed) {
5440 errmsg = "truncated data";
5441 startinpos = ((const char *)q) - starts;
5442 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005443 if (unicode_decode_call_errorhandler(
5444 errors,
5445 &errorHandler,
5446 "utf16", errmsg,
5447 &starts,
5448 (const char **)&e,
5449 &startinpos,
5450 &endinpos,
5451 &exc,
5452 (const char **)&q,
5453 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005454 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005455 goto onError;
5456 /* The remaining input chars are ignored if the callback
5457 chooses to skip the input */
5458 }
5459 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460
5461 if (byteorder)
5462 *byteorder = bo;
5463
Walter Dörwald69652032004-09-07 20:24:22 +00005464 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005465 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005466
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005468 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469 goto onError;
5470
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005471 Py_XDECREF(errorHandler);
5472 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005473 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474
Benjamin Peterson29060642009-01-31 22:14:21 +00005475 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005477 Py_XDECREF(errorHandler);
5478 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479 return NULL;
5480}
5481
Antoine Pitrouab868312009-01-10 15:40:25 +00005482#undef FAST_CHAR_MASK
5483#undef SWAPPED_FAST_CHAR_MASK
5484
Tim Peters772747b2001-08-09 22:21:55 +00005485PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005486_PyUnicode_EncodeUTF16(PyObject *str,
5487 const char *errors,
5488 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005490 int kind;
5491 void *data;
5492 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005493 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005494 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005495 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005496 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005497 /* Offsets from p for storing byte pairs in the right order. */
5498#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5499 int ihi = 1, ilo = 0;
5500#else
5501 int ihi = 0, ilo = 1;
5502#endif
5503
Benjamin Peterson29060642009-01-31 22:14:21 +00005504#define STORECHAR(CH) \
5505 do { \
5506 p[ihi] = ((CH) >> 8) & 0xff; \
5507 p[ilo] = (CH) & 0xff; \
5508 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005509 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005511 if (!PyUnicode_Check(str)) {
5512 PyErr_BadArgument();
5513 return NULL;
5514 }
5515 if (PyUnicode_READY(str) < 0)
5516 return NULL;
5517 kind = PyUnicode_KIND(str);
5518 data = PyUnicode_DATA(str);
5519 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005520
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005521 pairs = 0;
5522 if (kind == PyUnicode_4BYTE_KIND)
5523 for (i = 0; i < len; i++)
5524 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5525 pairs++;
5526 /* 2 * (len + pairs + (byteorder == 0)) */
5527 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005529 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005530 bytesize = nsize * 2;
5531 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005532 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005533 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534 if (v == NULL)
5535 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005537 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005539 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005540 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005541 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005542
5543 if (byteorder == -1) {
5544 /* force LE */
5545 ihi = 1;
5546 ilo = 0;
5547 }
5548 else if (byteorder == 1) {
5549 /* force BE */
5550 ihi = 0;
5551 ilo = 1;
5552 }
5553
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005554 for (i = 0; i < len; i++) {
5555 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5556 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005557 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005558 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5559 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005560 }
Tim Peters772747b2001-08-09 22:21:55 +00005561 STORECHAR(ch);
5562 if (ch2)
5563 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005564 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005565
5566 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005567 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005568#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569}
5570
Alexander Belopolsky40018472011-02-26 01:02:56 +00005571PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005572PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5573 Py_ssize_t size,
5574 const char *errors,
5575 int byteorder)
5576{
5577 PyObject *result;
5578 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5579 if (tmp == NULL)
5580 return NULL;
5581 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5582 Py_DECREF(tmp);
5583 return result;
5584}
5585
5586PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005587PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005589 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590}
5591
/* --- Unicode Escape Codec ----------------------------------------------- */
5593
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped string decodes to pure ASCII and, if so, how long the
   result is.

   s, size: the raw (still escaped) input bytes.

   Returns the number of characters the decoded string will have, or -1
   when that cannot be guaranteed to be an ASCII string:
     - size is negative,
     - a byte > 127 occurs,
     - a backslash is the last byte of the input,
     - an octal, \x, \u, \U or \N escape occurs (these may produce
       non-ASCII characters).
   A backslash followed by a newline contributes no character; an
   unrecognised escape counts as two characters (the backslash and the
   character after it). */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before it is used in pointer arithmetic. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5648
Fredrik Lundh06d12682001-01-24 07:59:11 +00005649static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005650
Alexander Belopolsky40018472011-02-26 01:02:56 +00005651PyObject *
5652PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005653 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005654 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005656 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005657 Py_ssize_t startinpos;
5658 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005659 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005660 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005662 char* message;
5663 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005664 PyObject *errorHandler = NULL;
5665 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005666 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005667 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005668
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005669 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005670
5671 /* After length_of_escaped_ascii_string() there are two alternatives,
5672 either the string is pure ASCII with named escapes like \n, etc.
5673 and we determined it's exact size (common case)
5674 or it contains \x, \u, ... escape sequences. then we create a
5675 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005676 if (len >= 0) {
5677 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005678 if (!v)
5679 goto onError;
5680 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005681 }
5682 else {
5683 /* Escaped strings will always be longer than the resulting
5684 Unicode string, so we start with size here and then reduce the
5685 length after conversion to the true value.
5686 (but if the error callback returns a long replacement string
5687 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005688 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005689 if (!v)
5690 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005691 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005692 }
5693
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005695 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005696 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005698
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 while (s < end) {
5700 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005701 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005702 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005704 /* The only case in which i == ascii_length is a backslash
5705 followed by a newline. */
5706 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005707
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 /* Non-escape characters are interpreted as Unicode ordinals */
5709 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005710 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5711 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 continue;
5713 }
5714
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 /* \ - Escapes */
5717 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005718 c = *s++;
5719 if (s > end)
5720 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005721
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005722 /* The only case in which i == ascii_length is a backslash
5723 followed by a newline. */
5724 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005725
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005726 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727
Benjamin Peterson29060642009-01-31 22:14:21 +00005728 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005729#define WRITECHAR(ch) \
5730 do { \
5731 if (unicode_putchar(&v, &i, ch) < 0) \
5732 goto onError; \
5733 }while(0)
5734
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005736 case '\\': WRITECHAR('\\'); break;
5737 case '\'': WRITECHAR('\''); break;
5738 case '\"': WRITECHAR('\"'); break;
5739 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005740 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005741 case 'f': WRITECHAR('\014'); break;
5742 case 't': WRITECHAR('\t'); break;
5743 case 'n': WRITECHAR('\n'); break;
5744 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005745 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005746 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005747 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005748 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749
Benjamin Peterson29060642009-01-31 22:14:21 +00005750 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 case '0': case '1': case '2': case '3':
5752 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005753 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005754 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005755 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005756 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005757 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005759 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 break;
5761
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 /* hex escapes */
5763 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005765 digits = 2;
5766 message = "truncated \\xXX escape";
5767 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768
Benjamin Peterson29060642009-01-31 22:14:21 +00005769 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005771 digits = 4;
5772 message = "truncated \\uXXXX escape";
5773 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774
Benjamin Peterson29060642009-01-31 22:14:21 +00005775 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005776 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005777 digits = 8;
5778 message = "truncated \\UXXXXXXXX escape";
5779 hexescape:
5780 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005781 if (s+digits>end) {
5782 endinpos = size;
5783 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005784 errors, &errorHandler,
5785 "unicodeescape", "end of string in escape sequence",
5786 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005787 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005788 goto onError;
5789 goto nextByte;
5790 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005791 for (j = 0; j < digits; ++j) {
5792 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005793 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005794 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005795 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005796 errors, &errorHandler,
5797 "unicodeescape", message,
5798 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005799 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005800 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005801 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005802 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005803 }
5804 chr = (chr<<4) & ~0xF;
5805 if (c >= '0' && c <= '9')
5806 chr += c - '0';
5807 else if (c >= 'a' && c <= 'f')
5808 chr += 10 + c - 'a';
5809 else
5810 chr += 10 + c - 'A';
5811 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005812 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005813 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005814 /* _decoding_error will have already written into the
5815 target buffer. */
5816 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005817 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005818 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005819 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005820 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005821 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005822 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005823 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 errors, &errorHandler,
5825 "unicodeescape", "illegal Unicode character",
5826 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005827 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005828 goto onError;
5829 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005830 break;
5831
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005833 case 'N':
5834 message = "malformed \\N character escape";
5835 if (ucnhash_CAPI == NULL) {
5836 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005837 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5838 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005839 if (ucnhash_CAPI == NULL)
5840 goto ucnhashError;
5841 }
5842 if (*s == '{') {
5843 const char *start = s+1;
5844 /* look for the closing brace */
5845 while (*s != '}' && s < end)
5846 s++;
5847 if (s > start && s < end && *s == '}') {
5848 /* found a name. look it up in the unicode database */
5849 message = "unknown Unicode character name";
5850 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005851 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005852 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005853 goto store;
5854 }
5855 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005856 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005857 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005858 errors, &errorHandler,
5859 "unicodeescape", message,
5860 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005861 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005862 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005863 break;
5864
5865 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005866 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005867 message = "\\ at end of string";
5868 s--;
5869 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005870 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005871 errors, &errorHandler,
5872 "unicodeescape", message,
5873 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005874 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005875 goto onError;
5876 }
5877 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005878 WRITECHAR('\\');
5879 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005880 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005881 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005884 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005886#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005887
Victor Stinner16e6a802011-12-12 13:24:15 +01005888 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005889 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005890 Py_XDECREF(errorHandler);
5891 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005892 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005893
Benjamin Peterson29060642009-01-31 22:14:21 +00005894 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005895 PyErr_SetString(
5896 PyExc_UnicodeError,
5897 "\\N escapes not supported (can't load unicodedata module)"
5898 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005899 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005900 Py_XDECREF(errorHandler);
5901 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005902 return NULL;
5903
Benjamin Peterson29060642009-01-31 22:14:21 +00005904 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005906 Py_XDECREF(errorHandler);
5907 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 return NULL;
5909}
5910
/* Return a Unicode-Escape encoded bytes object for the Unicode object.

   NOTE: this comment used to describe a "quotes" flag that enclosed the
   result in u"" or u'' quotes.  The function below takes no such
   argument and writes no surrounding quotes.

*/
5917
Alexander Belopolsky40018472011-02-26 01:02:56 +00005918PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005919PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005921 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005922 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005924 int kind;
5925 void *data;
5926 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927
Thomas Wouters89f507f2006-12-13 04:49:30 +00005928 /* Initial allocation is based on the longest-possible unichr
5929 escape.
5930
5931 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5932 unichr, so in this case it's the longest unichr escape. In
5933 narrow (UTF-16) builds this is five chars per source unichr
5934 since there are two unichrs in the surrogate pair, so in narrow
5935 (UTF-16) builds it's not the longest unichr escape.
5936
5937 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5938 so in the narrow (UTF-16) build case it's the longest unichr
5939 escape.
5940 */
5941
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005942 if (!PyUnicode_Check(unicode)) {
5943 PyErr_BadArgument();
5944 return NULL;
5945 }
5946 if (PyUnicode_READY(unicode) < 0)
5947 return NULL;
5948 len = PyUnicode_GET_LENGTH(unicode);
5949 kind = PyUnicode_KIND(unicode);
5950 data = PyUnicode_DATA(unicode);
5951 switch(kind) {
5952 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5953 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5954 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5955 }
5956
5957 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005958 return PyBytes_FromStringAndSize(NULL, 0);
5959
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005960 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005961 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005962
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005963 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005964 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005965 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005966 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 if (repr == NULL)
5968 return NULL;
5969
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005970 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005972 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005973 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005974
Walter Dörwald79e913e2007-05-12 11:08:06 +00005975 /* Escape backslashes */
5976 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 *p++ = '\\';
5978 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005979 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005980 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005981
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005982 /* Map 21-bit characters to '\U00xxxxxx' */
5983 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005984 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005985 *p++ = '\\';
5986 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005987 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5988 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5989 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5990 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5991 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5992 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5993 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5994 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005996 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005997
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005999 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 *p++ = '\\';
6001 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006002 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6003 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6004 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6005 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006007
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006008 /* Map special whitespace to '\t', \n', '\r' */
6009 else if (ch == '\t') {
6010 *p++ = '\\';
6011 *p++ = 't';
6012 }
6013 else if (ch == '\n') {
6014 *p++ = '\\';
6015 *p++ = 'n';
6016 }
6017 else if (ch == '\r') {
6018 *p++ = '\\';
6019 *p++ = 'r';
6020 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006021
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006022 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006023 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006025 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006026 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6027 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006028 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006029
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 /* Copy everything else as-is */
6031 else
6032 *p++ = (char) ch;
6033 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006035 assert(p - PyBytes_AS_STRING(repr) > 0);
6036 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6037 return NULL;
6038 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039}
6040
Alexander Belopolsky40018472011-02-26 01:02:56 +00006041PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006042PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6043 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006045 PyObject *result;
6046 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6047 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006049 result = PyUnicode_AsUnicodeEscapeString(tmp);
6050 Py_DECREF(tmp);
6051 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052}
6053
6054/* --- Raw Unicode Escape Codec ------------------------------------------- */
6055
Alexander Belopolsky40018472011-02-26 01:02:56 +00006056PyObject *
6057PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006058 Py_ssize_t size,
6059 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006061 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006062 Py_ssize_t startinpos;
6063 Py_ssize_t endinpos;
6064 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006065 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066 const char *end;
6067 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006068 PyObject *errorHandler = NULL;
6069 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006070
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 /* Escaped strings will always be longer than the resulting
6072 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006073 length after conversion to the true value. (But decoding error
6074 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006075 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006079 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006080 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081 end = s + size;
6082 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006083 unsigned char c;
6084 Py_UCS4 x;
6085 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006086 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 /* Non-escape characters are interpreted as Unicode ordinals */
6089 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006090 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6091 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006092 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006093 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 startinpos = s-starts;
6095
6096 /* \u-escapes are only interpreted iff the number of leading
6097 backslashes if odd */
6098 bs = s;
6099 for (;s < end;) {
6100 if (*s != '\\')
6101 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006102 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6103 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006104 }
6105 if (((s - bs) & 1) == 0 ||
6106 s >= end ||
6107 (*s != 'u' && *s != 'U')) {
6108 continue;
6109 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006110 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006111 count = *s=='u' ? 4 : 8;
6112 s++;
6113
6114 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006115 for (x = 0, i = 0; i < count; ++i, ++s) {
6116 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006117 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 endinpos = s-starts;
6119 if (unicode_decode_call_errorhandler(
6120 errors, &errorHandler,
6121 "rawunicodeescape", "truncated \\uXXXX",
6122 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006123 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 goto onError;
6125 goto nextByte;
6126 }
6127 x = (x<<4) & ~0xF;
6128 if (c >= '0' && c <= '9')
6129 x += c - '0';
6130 else if (c >= 'a' && c <= 'f')
6131 x += 10 + c - 'a';
6132 else
6133 x += 10 + c - 'A';
6134 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006135 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006136 if (unicode_putchar(&v, &outpos, x) < 0)
6137 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006138 } else {
6139 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006140 if (unicode_decode_call_errorhandler(
6141 errors, &errorHandler,
6142 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006143 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006144 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006146 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006147 nextByte:
6148 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006150 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006151 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006152 Py_XDECREF(errorHandler);
6153 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006154 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006155
Benjamin Peterson29060642009-01-31 22:14:21 +00006156 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006158 Py_XDECREF(errorHandler);
6159 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 return NULL;
6161}
6162
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006163
Alexander Belopolsky40018472011-02-26 01:02:56 +00006164PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006165PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006167 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 char *p;
6169 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006170 Py_ssize_t expandsize, pos;
6171 int kind;
6172 void *data;
6173 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006175 if (!PyUnicode_Check(unicode)) {
6176 PyErr_BadArgument();
6177 return NULL;
6178 }
6179 if (PyUnicode_READY(unicode) < 0)
6180 return NULL;
6181 kind = PyUnicode_KIND(unicode);
6182 data = PyUnicode_DATA(unicode);
6183 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006184 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6185 bytes, and 1 byte characters 4. */
6186 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006187
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006188 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006190
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006191 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192 if (repr == NULL)
6193 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006194 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006195 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006197 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006198 for (pos = 0; pos < len; pos++) {
6199 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006200 /* Map 32-bit characters to '\Uxxxxxxxx' */
6201 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006202 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006203 *p++ = '\\';
6204 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006205 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6206 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6207 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6208 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6209 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6210 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6211 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6212 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006213 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006215 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 *p++ = '\\';
6217 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006218 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6219 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6220 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6221 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 /* Copy everything else as-is */
6224 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 *p++ = (char) ch;
6226 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006227
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006228 assert(p > q);
6229 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006230 return NULL;
6231 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232}
6233
Alexander Belopolsky40018472011-02-26 01:02:56 +00006234PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006235PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6236 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006238 PyObject *result;
6239 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6240 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006241 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006242 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6243 Py_DECREF(tmp);
6244 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245}
6246
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006247/* --- Unicode Internal Codec ------------------------------------------- */
6248
Alexander Belopolsky40018472011-02-26 01:02:56 +00006249PyObject *
6250_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006251 Py_ssize_t size,
6252 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006253{
6254 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006255 Py_ssize_t startinpos;
6256 Py_ssize_t endinpos;
6257 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006258 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006259 const char *end;
6260 const char *reason;
6261 PyObject *errorHandler = NULL;
6262 PyObject *exc = NULL;
6263
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006264 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006265 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006266 1))
6267 return NULL;
6268
Thomas Wouters89f507f2006-12-13 04:49:30 +00006269 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006270 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006271 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006272 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006273 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006274 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006275 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006276 end = s + size;
6277
6278 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006279 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006280 Py_UCS4 ch;
6281 /* We copy the raw representation one byte at a time because the
6282 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006283 ((char *) &uch)[0] = s[0];
6284 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006285#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006286 ((char *) &uch)[2] = s[2];
6287 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006288#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006289 ch = uch;
6290
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006291 /* We have to sanity check the raw data, otherwise doom looms for
6292 some malformed UCS-4 data. */
6293 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006294#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006295 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006296#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006297 end-s < Py_UNICODE_SIZE
6298 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006300 startinpos = s - starts;
6301 if (end-s < Py_UNICODE_SIZE) {
6302 endinpos = end-starts;
6303 reason = "truncated input";
6304 }
6305 else {
6306 endinpos = s - starts + Py_UNICODE_SIZE;
6307 reason = "illegal code point (> 0x10FFFF)";
6308 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006309 if (unicode_decode_call_errorhandler(
6310 errors, &errorHandler,
6311 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006312 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006313 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006314 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006315 continue;
6316 }
6317
6318 s += Py_UNICODE_SIZE;
6319#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006320 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006321 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006322 Py_UNICODE uch2;
6323 ((char *) &uch2)[0] = s[0];
6324 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006325 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006326 {
Victor Stinner551ac952011-11-29 22:58:13 +01006327 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006328 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006329 }
6330 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006331#endif
6332
6333 if (unicode_putchar(&v, &outpos, ch) < 0)
6334 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006335 }
6336
Victor Stinner16e6a802011-12-12 13:24:15 +01006337 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006338 goto onError;
6339 Py_XDECREF(errorHandler);
6340 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006341 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006342
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006344 Py_XDECREF(v);
6345 Py_XDECREF(errorHandler);
6346 Py_XDECREF(exc);
6347 return NULL;
6348}
6349
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350/* --- Latin-1 Codec ------------------------------------------------------ */
6351
Alexander Belopolsky40018472011-02-26 01:02:56 +00006352PyObject *
6353PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006354 Py_ssize_t size,
6355 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006358 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359}
6360
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006361/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006362static void
6363make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006364 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006365 PyObject *unicode,
6366 Py_ssize_t startpos, Py_ssize_t endpos,
6367 const char *reason)
6368{
6369 if (*exceptionObject == NULL) {
6370 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006371 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006372 encoding, unicode, startpos, endpos, reason);
6373 }
6374 else {
6375 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6376 goto onError;
6377 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6378 goto onError;
6379 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6380 goto onError;
6381 return;
6382 onError:
6383 Py_DECREF(*exceptionObject);
6384 *exceptionObject = NULL;
6385 }
6386}
6387
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006388/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006389static void
6390raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006391 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006392 PyObject *unicode,
6393 Py_ssize_t startpos, Py_ssize_t endpos,
6394 const char *reason)
6395{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006396 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006397 encoding, unicode, startpos, endpos, reason);
6398 if (*exceptionObject != NULL)
6399 PyCodec_StrictErrors(*exceptionObject);
6400}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006401
6402/* error handling callback helper:
6403 build arguments, call the callback and check the arguments,
6404 put the result into newpos and return the replacement string, which
6405 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006406static PyObject *
6407unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006408 PyObject **errorHandler,
6409 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006410 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006411 Py_ssize_t startpos, Py_ssize_t endpos,
6412 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006413{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006414 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006415 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006416 PyObject *restuple;
6417 PyObject *resunicode;
6418
6419 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006422 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006423 }
6424
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006425 if (PyUnicode_READY(unicode) < 0)
6426 return NULL;
6427 len = PyUnicode_GET_LENGTH(unicode);
6428
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006429 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006430 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006431 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006432 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006433
6434 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006435 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006438 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006439 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006440 Py_DECREF(restuple);
6441 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006442 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006443 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006444 &resunicode, newpos)) {
6445 Py_DECREF(restuple);
6446 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006447 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006448 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6449 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6450 Py_DECREF(restuple);
6451 return NULL;
6452 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006453 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006454 *newpos = len + *newpos;
6455 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6457 Py_DECREF(restuple);
6458 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006459 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006460 Py_INCREF(resunicode);
6461 Py_DECREF(restuple);
6462 return resunicode;
6463}
6464
Alexander Belopolsky40018472011-02-26 01:02:56 +00006465static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006466unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006467 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006468 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006469{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006470 /* input state */
6471 Py_ssize_t pos=0, size;
6472 int kind;
6473 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006474 /* output object */
6475 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006476 /* pointer into the output */
6477 char *str;
6478 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006479 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006480 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6481 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006482 PyObject *errorHandler = NULL;
6483 PyObject *exc = NULL;
6484 /* the following variable is used for caching string comparisons
6485 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6486 int known_errorHandler = -1;
6487
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006488 if (PyUnicode_READY(unicode) < 0)
6489 return NULL;
6490 size = PyUnicode_GET_LENGTH(unicode);
6491 kind = PyUnicode_KIND(unicode);
6492 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006493 /* allocate enough for a simple encoding without
6494 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006495 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006496 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006497 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006498 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006499 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006500 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006501 ressize = size;
6502
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006503 while (pos < size) {
6504 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006505
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 /* can we encode this? */
6507 if (c<limit) {
6508 /* no overflow check, because we know that the space is enough */
6509 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006510 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006511 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006513 Py_ssize_t requiredsize;
6514 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006515 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006516 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006517 Py_ssize_t collstart = pos;
6518 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006519 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006520 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 ++collend;
6522 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6523 if (known_errorHandler==-1) {
6524 if ((errors==NULL) || (!strcmp(errors, "strict")))
6525 known_errorHandler = 1;
6526 else if (!strcmp(errors, "replace"))
6527 known_errorHandler = 2;
6528 else if (!strcmp(errors, "ignore"))
6529 known_errorHandler = 3;
6530 else if (!strcmp(errors, "xmlcharrefreplace"))
6531 known_errorHandler = 4;
6532 else
6533 known_errorHandler = 0;
6534 }
6535 switch (known_errorHandler) {
6536 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006537 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 goto onError;
6539 case 2: /* replace */
6540 while (collstart++<collend)
6541 *str++ = '?'; /* fall through */
6542 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006543 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 break;
6545 case 4: /* xmlcharrefreplace */
6546 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006547 /* determine replacement size */
6548 for (i = collstart, repsize = 0; i < collend; ++i) {
6549 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6550 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006551 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006552 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006554 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006555 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006556 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006557 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006558 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006560 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006561 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006562 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006563 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006565 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006567 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006568 if (requiredsize > ressize) {
6569 if (requiredsize<2*ressize)
6570 requiredsize = 2*ressize;
6571 if (_PyBytes_Resize(&res, requiredsize))
6572 goto onError;
6573 str = PyBytes_AS_STRING(res) + respos;
6574 ressize = requiredsize;
6575 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006576 /* generate replacement */
6577 for (i = collstart; i < collend; ++i) {
6578 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006580 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 break;
6582 default:
6583 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006584 encoding, reason, unicode, &exc,
6585 collstart, collend, &newpos);
6586 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6587 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006588 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006589 if (PyBytes_Check(repunicode)) {
6590 /* Directly copy bytes result to output. */
6591 repsize = PyBytes_Size(repunicode);
6592 if (repsize > 1) {
6593 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006594 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006595 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6596 Py_DECREF(repunicode);
6597 goto onError;
6598 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006599 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006600 ressize += repsize-1;
6601 }
6602 memcpy(str, PyBytes_AsString(repunicode), repsize);
6603 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006604 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006605 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006606 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006607 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006608 /* need more space? (at least enough for what we
6609 have+the replacement+the rest of the string, so
6610 we won't have to check space for encodable characters) */
6611 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006612 repsize = PyUnicode_GET_LENGTH(repunicode);
6613 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 if (requiredsize > ressize) {
6615 if (requiredsize<2*ressize)
6616 requiredsize = 2*ressize;
6617 if (_PyBytes_Resize(&res, requiredsize)) {
6618 Py_DECREF(repunicode);
6619 goto onError;
6620 }
6621 str = PyBytes_AS_STRING(res) + respos;
6622 ressize = requiredsize;
6623 }
6624 /* check if there is anything unencodable in the replacement
6625 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006626 for (i = 0; repsize-->0; ++i, ++str) {
6627 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006628 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006629 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006630 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006631 Py_DECREF(repunicode);
6632 goto onError;
6633 }
6634 *str = (char)c;
6635 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006636 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006637 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006638 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006639 }
6640 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006641 /* Resize if we allocated to much */
6642 size = str - PyBytes_AS_STRING(res);
6643 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006644 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006645 if (_PyBytes_Resize(&res, size) < 0)
6646 goto onError;
6647 }
6648
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006649 Py_XDECREF(errorHandler);
6650 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006651 return res;
6652
6653 onError:
6654 Py_XDECREF(res);
6655 Py_XDECREF(errorHandler);
6656 Py_XDECREF(exc);
6657 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006658}
6659
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006660/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006661PyObject *
6662PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006663 Py_ssize_t size,
6664 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006666 PyObject *result;
6667 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6668 if (unicode == NULL)
6669 return NULL;
6670 result = unicode_encode_ucs1(unicode, errors, 256);
6671 Py_DECREF(unicode);
6672 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673}
6674
Alexander Belopolsky40018472011-02-26 01:02:56 +00006675PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006676_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677{
6678 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006679 PyErr_BadArgument();
6680 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006682 if (PyUnicode_READY(unicode) == -1)
6683 return NULL;
6684 /* Fast path: if it is a one-byte string, construct
6685 bytes object directly. */
6686 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6687 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6688 PyUnicode_GET_LENGTH(unicode));
6689 /* Non-Latin-1 characters present. Defer to above function to
6690 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006691 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006692}
6693
6694PyObject*
6695PyUnicode_AsLatin1String(PyObject *unicode)
6696{
6697 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698}
6699
6700/* --- 7-bit ASCII Codec -------------------------------------------------- */
6701
Alexander Belopolsky40018472011-02-26 01:02:56 +00006702PyObject *
6703PyUnicode_DecodeASCII(const char *s,
6704 Py_ssize_t size,
6705 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006706{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006707 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006708 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006709 int kind;
6710 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006711 Py_ssize_t startinpos;
6712 Py_ssize_t endinpos;
6713 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006714 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006715 int has_error;
6716 const unsigned char *p = (const unsigned char *)s;
6717 const unsigned char *end = p + size;
6718 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006719 PyObject *errorHandler = NULL;
6720 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006721
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006722 if (size == 0) {
6723 Py_INCREF(unicode_empty);
6724 return unicode_empty;
6725 }
6726
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006728 if (size == 1 && (unsigned char)s[0] < 128)
6729 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006730
Victor Stinner702c7342011-10-05 13:50:52 +02006731 has_error = 0;
6732 while (p < end && !has_error) {
6733 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6734 an explanation. */
6735 if (!((size_t) p & LONG_PTR_MASK)) {
6736 /* Help register allocation */
6737 register const unsigned char *_p = p;
6738 while (_p < aligned_end) {
6739 unsigned long value = *(unsigned long *) _p;
6740 if (value & ASCII_CHAR_MASK) {
6741 has_error = 1;
6742 break;
6743 }
6744 _p += SIZEOF_LONG;
6745 }
6746 if (_p == end)
6747 break;
6748 if (has_error)
6749 break;
6750 p = _p;
6751 }
6752 if (*p & 0x80) {
6753 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006754 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006755 }
6756 else {
6757 ++p;
6758 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006759 }
Victor Stinner702c7342011-10-05 13:50:52 +02006760 if (!has_error)
6761 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006762
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006763 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006765 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006767 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006768 kind = PyUnicode_KIND(v);
6769 data = PyUnicode_DATA(v);
6770 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006771 e = s + size;
6772 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 register unsigned char c = (unsigned char)*s;
6774 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006775 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006776 ++s;
6777 }
6778 else {
6779 startinpos = s-starts;
6780 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 if (unicode_decode_call_errorhandler(
6782 errors, &errorHandler,
6783 "ascii", "ordinal not in range(128)",
6784 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006785 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006786 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006787 kind = PyUnicode_KIND(v);
6788 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006789 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006791 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006792 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006793 Py_XDECREF(errorHandler);
6794 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006795 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006796 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006797
Benjamin Peterson29060642009-01-31 22:14:21 +00006798 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006800 Py_XDECREF(errorHandler);
6801 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802 return NULL;
6803}
6804
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006805/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006806PyObject *
6807PyUnicode_EncodeASCII(const Py_UNICODE *p,
6808 Py_ssize_t size,
6809 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006811 PyObject *result;
6812 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6813 if (unicode == NULL)
6814 return NULL;
6815 result = unicode_encode_ucs1(unicode, errors, 128);
6816 Py_DECREF(unicode);
6817 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818}
6819
Alexander Belopolsky40018472011-02-26 01:02:56 +00006820PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006821_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822{
6823 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006824 PyErr_BadArgument();
6825 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006827 if (PyUnicode_READY(unicode) == -1)
6828 return NULL;
6829 /* Fast path: if it is an ASCII-only string, construct bytes object
6830 directly. Else defer to above function to raise the exception. */
6831 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6832 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6833 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006834 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006835}
6836
6837PyObject *
6838PyUnicode_AsASCIIString(PyObject *unicode)
6839{
6840 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841}
6842
Victor Stinner99b95382011-07-04 14:23:54 +02006843#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006844
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006845/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006846
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006847#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006848#define NEED_RETRY
6849#endif
6850
Victor Stinner3a50e702011-10-18 21:21:00 +02006851#ifndef WC_ERR_INVALID_CHARS
6852# define WC_ERR_INVALID_CHARS 0x0080
6853#endif
6854
6855static char*
6856code_page_name(UINT code_page, PyObject **obj)
6857{
6858 *obj = NULL;
6859 if (code_page == CP_ACP)
6860 return "mbcs";
6861 if (code_page == CP_UTF7)
6862 return "CP_UTF7";
6863 if (code_page == CP_UTF8)
6864 return "CP_UTF8";
6865
6866 *obj = PyBytes_FromFormat("cp%u", code_page);
6867 if (*obj == NULL)
6868 return NULL;
6869 return PyBytes_AS_STRING(*obj);
6870}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006871
Alexander Belopolsky40018472011-02-26 01:02:56 +00006872static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006873is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006874{
6875 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006876 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006877
Victor Stinner3a50e702011-10-18 21:21:00 +02006878 if (!IsDBCSLeadByteEx(code_page, *curr))
6879 return 0;
6880
6881 prev = CharPrevExA(code_page, s, curr, 0);
6882 if (prev == curr)
6883 return 1;
6884 /* FIXME: This code is limited to "true" double-byte encodings,
6885 as it assumes an incomplete character consists of a single
6886 byte. */
6887 if (curr - prev == 2)
6888 return 1;
6889 if (!IsDBCSLeadByteEx(code_page, *prev))
6890 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006891 return 0;
6892}
6893
Victor Stinner3a50e702011-10-18 21:21:00 +02006894static DWORD
6895decode_code_page_flags(UINT code_page)
6896{
6897 if (code_page == CP_UTF7) {
6898 /* The CP_UTF7 decoder only supports flags=0 */
6899 return 0;
6900 }
6901 else
6902 return MB_ERR_INVALID_CHARS;
6903}
6904
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006905/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006906 * Decode a byte string from a Windows code page into unicode object in strict
6907 * mode.
6908 *
6909 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6910 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006911 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006912static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006913decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006914 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006915 const char *in,
6916 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006917{
Victor Stinner3a50e702011-10-18 21:21:00 +02006918 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006919 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006920 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006921
6922 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006923 assert(insize > 0);
6924 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6925 if (outsize <= 0)
6926 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006927
6928 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006930 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006931 if (*v == NULL)
6932 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006933 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006934 }
6935 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006937 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006938 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006939 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006940 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006941 }
6942
6943 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006944 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6945 if (outsize <= 0)
6946 goto error;
6947 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006948
Victor Stinner3a50e702011-10-18 21:21:00 +02006949error:
6950 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6951 return -2;
6952 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006953 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006954}
6955
Victor Stinner3a50e702011-10-18 21:21:00 +02006956/*
6957 * Decode a byte string from a code page into unicode object with an error
6958 * handler.
6959 *
6960 * Returns consumed size if succeed, or raise a WindowsError or
6961 * UnicodeDecodeError exception and returns -1 on error.
6962 */
6963static int
6964decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006965 PyObject **v,
6966 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006967 const char *errors)
6968{
6969 const char *startin = in;
6970 const char *endin = in + size;
6971 const DWORD flags = decode_code_page_flags(code_page);
6972 /* Ideally, we should get reason from FormatMessage. This is the Windows
6973 2000 English version of the message. */
6974 const char *reason = "No mapping for the Unicode character exists "
6975 "in the target code page.";
6976 /* each step cannot decode more than 1 character, but a character can be
6977 represented as a surrogate pair */
6978 wchar_t buffer[2], *startout, *out;
6979 int insize, outsize;
6980 PyObject *errorHandler = NULL;
6981 PyObject *exc = NULL;
6982 PyObject *encoding_obj = NULL;
6983 char *encoding;
6984 DWORD err;
6985 int ret = -1;
6986
6987 assert(size > 0);
6988
6989 encoding = code_page_name(code_page, &encoding_obj);
6990 if (encoding == NULL)
6991 return -1;
6992
6993 if (errors == NULL || strcmp(errors, "strict") == 0) {
6994 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6995 UnicodeDecodeError. */
6996 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6997 if (exc != NULL) {
6998 PyCodec_StrictErrors(exc);
6999 Py_CLEAR(exc);
7000 }
7001 goto error;
7002 }
7003
7004 if (*v == NULL) {
7005 /* Create unicode object */
7006 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7007 PyErr_NoMemory();
7008 goto error;
7009 }
Victor Stinner76a31a62011-11-04 00:05:13 +01007010 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007011 if (*v == NULL)
7012 goto error;
7013 startout = PyUnicode_AS_UNICODE(*v);
7014 }
7015 else {
7016 /* Extend unicode object */
7017 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7018 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7019 PyErr_NoMemory();
7020 goto error;
7021 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007022 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007023 goto error;
7024 startout = PyUnicode_AS_UNICODE(*v) + n;
7025 }
7026
7027 /* Decode the byte string character per character */
7028 out = startout;
7029 while (in < endin)
7030 {
7031 /* Decode a character */
7032 insize = 1;
7033 do
7034 {
7035 outsize = MultiByteToWideChar(code_page, flags,
7036 in, insize,
7037 buffer, Py_ARRAY_LENGTH(buffer));
7038 if (outsize > 0)
7039 break;
7040 err = GetLastError();
7041 if (err != ERROR_NO_UNICODE_TRANSLATION
7042 && err != ERROR_INSUFFICIENT_BUFFER)
7043 {
7044 PyErr_SetFromWindowsErr(0);
7045 goto error;
7046 }
7047 insize++;
7048 }
7049 /* 4=maximum length of a UTF-8 sequence */
7050 while (insize <= 4 && (in + insize) <= endin);
7051
7052 if (outsize <= 0) {
7053 Py_ssize_t startinpos, endinpos, outpos;
7054
7055 startinpos = in - startin;
7056 endinpos = startinpos + 1;
7057 outpos = out - PyUnicode_AS_UNICODE(*v);
7058 if (unicode_decode_call_errorhandler(
7059 errors, &errorHandler,
7060 encoding, reason,
7061 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007062 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007063 {
7064 goto error;
7065 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007066 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007067 }
7068 else {
7069 in += insize;
7070 memcpy(out, buffer, outsize * sizeof(wchar_t));
7071 out += outsize;
7072 }
7073 }
7074
7075 /* write a NUL character at the end */
7076 *out = 0;
7077
7078 /* Extend unicode object */
7079 outsize = out - startout;
7080 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007081 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007082 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007083 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007084
7085error:
7086 Py_XDECREF(encoding_obj);
7087 Py_XDECREF(errorHandler);
7088 Py_XDECREF(exc);
7089 return ret;
7090}
7091
Victor Stinner3a50e702011-10-18 21:21:00 +02007092static PyObject *
7093decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007094 const char *s, Py_ssize_t size,
7095 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007096{
Victor Stinner76a31a62011-11-04 00:05:13 +01007097 PyObject *v = NULL;
7098 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007099
Victor Stinner3a50e702011-10-18 21:21:00 +02007100 if (code_page < 0) {
7101 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7102 return NULL;
7103 }
7104
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007105 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007106 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007107
Victor Stinner76a31a62011-11-04 00:05:13 +01007108 do
7109 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007110#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007111 if (size > INT_MAX) {
7112 chunk_size = INT_MAX;
7113 final = 0;
7114 done = 0;
7115 }
7116 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007117#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007118 {
7119 chunk_size = (int)size;
7120 final = (consumed == NULL);
7121 done = 1;
7122 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007123
Victor Stinner76a31a62011-11-04 00:05:13 +01007124 /* Skip trailing lead-byte unless 'final' is set */
7125 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7126 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007127
Victor Stinner76a31a62011-11-04 00:05:13 +01007128 if (chunk_size == 0 && done) {
7129 if (v != NULL)
7130 break;
7131 Py_INCREF(unicode_empty);
7132 return unicode_empty;
7133 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007134
Victor Stinner76a31a62011-11-04 00:05:13 +01007135
7136 converted = decode_code_page_strict(code_page, &v,
7137 s, chunk_size);
7138 if (converted == -2)
7139 converted = decode_code_page_errors(code_page, &v,
7140 s, chunk_size,
7141 errors);
7142 assert(converted != 0);
7143
7144 if (converted < 0) {
7145 Py_XDECREF(v);
7146 return NULL;
7147 }
7148
7149 if (consumed)
7150 *consumed += converted;
7151
7152 s += converted;
7153 size -= converted;
7154 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007155
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007156 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007157}
7158
Alexander Belopolsky40018472011-02-26 01:02:56 +00007159PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007160PyUnicode_DecodeCodePageStateful(int code_page,
7161 const char *s,
7162 Py_ssize_t size,
7163 const char *errors,
7164 Py_ssize_t *consumed)
7165{
7166 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7167}
7168
7169PyObject *
7170PyUnicode_DecodeMBCSStateful(const char *s,
7171 Py_ssize_t size,
7172 const char *errors,
7173 Py_ssize_t *consumed)
7174{
7175 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7176}
7177
7178PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007179PyUnicode_DecodeMBCS(const char *s,
7180 Py_ssize_t size,
7181 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007182{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007183 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7184}
7185
Victor Stinner3a50e702011-10-18 21:21:00 +02007186static DWORD
7187encode_code_page_flags(UINT code_page, const char *errors)
7188{
7189 if (code_page == CP_UTF8) {
7190 if (winver.dwMajorVersion >= 6)
7191 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7192 and later */
7193 return WC_ERR_INVALID_CHARS;
7194 else
7195 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7196 return 0;
7197 }
7198 else if (code_page == CP_UTF7) {
7199 /* CP_UTF7 only supports flags=0 */
7200 return 0;
7201 }
7202 else {
7203 if (errors != NULL && strcmp(errors, "replace") == 0)
7204 return 0;
7205 else
7206 return WC_NO_BEST_FIT_CHARS;
7207 }
7208}
7209
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007210/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007211 * Encode a Unicode string to a Windows code page into a byte string in strict
7212 * mode.
7213 *
7214 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7215 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007216 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007217static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007218encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007219 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007220 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007221{
Victor Stinner554f3f02010-06-16 23:33:54 +00007222 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007223 BOOL *pusedDefaultChar = &usedDefaultChar;
7224 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007225 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007226 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007227 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007228 const DWORD flags = encode_code_page_flags(code_page, NULL);
7229 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007230 /* Create a substring so that we can get the UTF-16 representation
7231 of just the slice under consideration. */
7232 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007233
Martin v. Löwis3d325192011-11-04 18:23:06 +01007234 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007235
Victor Stinner3a50e702011-10-18 21:21:00 +02007236 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007237 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007238 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007239 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007240
Victor Stinner2fc507f2011-11-04 20:06:39 +01007241 substring = PyUnicode_Substring(unicode, offset, offset+len);
7242 if (substring == NULL)
7243 return -1;
7244 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7245 if (p == NULL) {
7246 Py_DECREF(substring);
7247 return -1;
7248 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007249
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007250 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007251 outsize = WideCharToMultiByte(code_page, flags,
7252 p, size,
7253 NULL, 0,
7254 NULL, pusedDefaultChar);
7255 if (outsize <= 0)
7256 goto error;
7257 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007258 if (pusedDefaultChar && *pusedDefaultChar) {
7259 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007260 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007261 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007262
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007264 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007265 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007266 if (*outbytes == NULL) {
7267 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007268 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007269 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007270 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007271 }
7272 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007273 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007274 const Py_ssize_t n = PyBytes_Size(*outbytes);
7275 if (outsize > PY_SSIZE_T_MAX - n) {
7276 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007277 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007278 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007279 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007280 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7281 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007282 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007283 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007284 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007285 }
7286
7287 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007288 outsize = WideCharToMultiByte(code_page, flags,
7289 p, size,
7290 out, outsize,
7291 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007292 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007293 if (outsize <= 0)
7294 goto error;
7295 if (pusedDefaultChar && *pusedDefaultChar)
7296 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007297 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007298
Victor Stinner3a50e702011-10-18 21:21:00 +02007299error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007300 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007301 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7302 return -2;
7303 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007304 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007305}
7306
Victor Stinner3a50e702011-10-18 21:21:00 +02007307/*
7308 * Encode a Unicode string to a Windows code page into a byte string using a
7309 * error handler.
7310 *
7311 * Returns consumed characters if succeed, or raise a WindowsError and returns
7312 * -1 on other error.
7313 */
7314static int
7315encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007316 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007317 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007318{
Victor Stinner3a50e702011-10-18 21:21:00 +02007319 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007320 Py_ssize_t pos = unicode_offset;
7321 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007322 /* Ideally, we should get reason from FormatMessage. This is the Windows
7323 2000 English version of the message. */
7324 const char *reason = "invalid character";
7325 /* 4=maximum length of a UTF-8 sequence */
7326 char buffer[4];
7327 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7328 Py_ssize_t outsize;
7329 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007330 PyObject *errorHandler = NULL;
7331 PyObject *exc = NULL;
7332 PyObject *encoding_obj = NULL;
7333 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007334 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007335 PyObject *rep;
7336 int ret = -1;
7337
7338 assert(insize > 0);
7339
7340 encoding = code_page_name(code_page, &encoding_obj);
7341 if (encoding == NULL)
7342 return -1;
7343
7344 if (errors == NULL || strcmp(errors, "strict") == 0) {
7345 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7346 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007347 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007348 if (exc != NULL) {
7349 PyCodec_StrictErrors(exc);
7350 Py_DECREF(exc);
7351 }
7352 Py_XDECREF(encoding_obj);
7353 return -1;
7354 }
7355
7356 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7357 pusedDefaultChar = &usedDefaultChar;
7358 else
7359 pusedDefaultChar = NULL;
7360
7361 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7362 PyErr_NoMemory();
7363 goto error;
7364 }
7365 outsize = insize * Py_ARRAY_LENGTH(buffer);
7366
7367 if (*outbytes == NULL) {
7368 /* Create string object */
7369 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7370 if (*outbytes == NULL)
7371 goto error;
7372 out = PyBytes_AS_STRING(*outbytes);
7373 }
7374 else {
7375 /* Extend string object */
7376 Py_ssize_t n = PyBytes_Size(*outbytes);
7377 if (n > PY_SSIZE_T_MAX - outsize) {
7378 PyErr_NoMemory();
7379 goto error;
7380 }
7381 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7382 goto error;
7383 out = PyBytes_AS_STRING(*outbytes) + n;
7384 }
7385
7386 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007387 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007388 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007389 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7390 wchar_t chars[2];
7391 int charsize;
7392 if (ch < 0x10000) {
7393 chars[0] = (wchar_t)ch;
7394 charsize = 1;
7395 }
7396 else {
7397 ch -= 0x10000;
7398 chars[0] = 0xd800 + (ch >> 10);
7399 chars[1] = 0xdc00 + (ch & 0x3ff);
7400 charsize = 2;
7401 }
7402
Victor Stinner3a50e702011-10-18 21:21:00 +02007403 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007404 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007405 buffer, Py_ARRAY_LENGTH(buffer),
7406 NULL, pusedDefaultChar);
7407 if (outsize > 0) {
7408 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7409 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007410 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007411 memcpy(out, buffer, outsize);
7412 out += outsize;
7413 continue;
7414 }
7415 }
7416 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7417 PyErr_SetFromWindowsErr(0);
7418 goto error;
7419 }
7420
Victor Stinner3a50e702011-10-18 21:21:00 +02007421 rep = unicode_encode_call_errorhandler(
7422 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007423 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007424 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007425 if (rep == NULL)
7426 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007427 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007428
7429 if (PyBytes_Check(rep)) {
7430 outsize = PyBytes_GET_SIZE(rep);
7431 if (outsize != 1) {
7432 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7433 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7434 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7435 Py_DECREF(rep);
7436 goto error;
7437 }
7438 out = PyBytes_AS_STRING(*outbytes) + offset;
7439 }
7440 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7441 out += outsize;
7442 }
7443 else {
7444 Py_ssize_t i;
7445 enum PyUnicode_Kind kind;
7446 void *data;
7447
7448 if (PyUnicode_READY(rep) < 0) {
7449 Py_DECREF(rep);
7450 goto error;
7451 }
7452
7453 outsize = PyUnicode_GET_LENGTH(rep);
7454 if (outsize != 1) {
7455 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7456 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7457 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7458 Py_DECREF(rep);
7459 goto error;
7460 }
7461 out = PyBytes_AS_STRING(*outbytes) + offset;
7462 }
7463 kind = PyUnicode_KIND(rep);
7464 data = PyUnicode_DATA(rep);
7465 for (i=0; i < outsize; i++) {
7466 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7467 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007468 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007469 encoding, unicode,
7470 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007471 "unable to encode error handler result to ASCII");
7472 Py_DECREF(rep);
7473 goto error;
7474 }
7475 *out = (unsigned char)ch;
7476 out++;
7477 }
7478 }
7479 Py_DECREF(rep);
7480 }
7481 /* write a NUL byte */
7482 *out = 0;
7483 outsize = out - PyBytes_AS_STRING(*outbytes);
7484 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7485 if (_PyBytes_Resize(outbytes, outsize) < 0)
7486 goto error;
7487 ret = 0;
7488
7489error:
7490 Py_XDECREF(encoding_obj);
7491 Py_XDECREF(errorHandler);
7492 Py_XDECREF(exc);
7493 return ret;
7494}
7495
Victor Stinner3a50e702011-10-18 21:21:00 +02007496static PyObject *
7497encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007498 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 const char *errors)
7500{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007501 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007502 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007503 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007504 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007505
Victor Stinner2fc507f2011-11-04 20:06:39 +01007506 if (PyUnicode_READY(unicode) < 0)
7507 return NULL;
7508 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007509
Victor Stinner3a50e702011-10-18 21:21:00 +02007510 if (code_page < 0) {
7511 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7512 return NULL;
7513 }
7514
Martin v. Löwis3d325192011-11-04 18:23:06 +01007515 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007516 return PyBytes_FromStringAndSize(NULL, 0);
7517
Victor Stinner7581cef2011-11-03 22:32:33 +01007518 offset = 0;
7519 do
7520 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007521#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007522 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007523 chunks. */
7524 if (len > INT_MAX/2) {
7525 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007526 done = 0;
7527 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007528 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007529#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007530 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007531 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007532 done = 1;
7533 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007534
Victor Stinner76a31a62011-11-04 00:05:13 +01007535 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007536 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007537 errors);
7538 if (ret == -2)
7539 ret = encode_code_page_errors(code_page, &outbytes,
7540 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007541 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007542 if (ret < 0) {
7543 Py_XDECREF(outbytes);
7544 return NULL;
7545 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007546
Victor Stinner7581cef2011-11-03 22:32:33 +01007547 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007548 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007549 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007550
Victor Stinner3a50e702011-10-18 21:21:00 +02007551 return outbytes;
7552}
7553
7554PyObject *
7555PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7556 Py_ssize_t size,
7557 const char *errors)
7558{
Victor Stinner7581cef2011-11-03 22:32:33 +01007559 PyObject *unicode, *res;
7560 unicode = PyUnicode_FromUnicode(p, size);
7561 if (unicode == NULL)
7562 return NULL;
7563 res = encode_code_page(CP_ACP, unicode, errors);
7564 Py_DECREF(unicode);
7565 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007566}
7567
7568PyObject *
7569PyUnicode_EncodeCodePage(int code_page,
7570 PyObject *unicode,
7571 const char *errors)
7572{
Victor Stinner7581cef2011-11-03 22:32:33 +01007573 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007574}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007575
Alexander Belopolsky40018472011-02-26 01:02:56 +00007576PyObject *
7577PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007578{
7579 if (!PyUnicode_Check(unicode)) {
7580 PyErr_BadArgument();
7581 return NULL;
7582 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007583 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007584}
7585
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007586#undef NEED_RETRY
7587
Victor Stinner99b95382011-07-04 14:23:54 +02007588#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007589
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590/* --- Character Mapping Codec -------------------------------------------- */
7591
Alexander Belopolsky40018472011-02-26 01:02:56 +00007592PyObject *
7593PyUnicode_DecodeCharmap(const char *s,
7594 Py_ssize_t size,
7595 PyObject *mapping,
7596 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007598 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007599 Py_ssize_t startinpos;
7600 Py_ssize_t endinpos;
7601 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007602 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007603 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007604 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007605 PyObject *errorHandler = NULL;
7606 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007607
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608 /* Default to Latin-1 */
7609 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007612 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007616 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007617 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007618 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007619 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007620 Py_ssize_t maplen;
7621 enum PyUnicode_Kind kind;
7622 void *data;
7623 Py_UCS4 x;
7624
7625 if (PyUnicode_READY(mapping) < 0)
7626 return NULL;
7627
7628 maplen = PyUnicode_GET_LENGTH(mapping);
7629 data = PyUnicode_DATA(mapping);
7630 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007631 while (s < e) {
7632 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633
Benjamin Peterson29060642009-01-31 22:14:21 +00007634 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007635 x = PyUnicode_READ(kind, data, ch);
7636 else
7637 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007639 if (x == 0xfffe)
7640 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007642 startinpos = s-starts;
7643 endinpos = startinpos+1;
7644 if (unicode_decode_call_errorhandler(
7645 errors, &errorHandler,
7646 "charmap", "character maps to <undefined>",
7647 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007648 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007649 goto onError;
7650 }
7651 continue;
7652 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007653
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007654 if (unicode_putchar(&v, &outpos, x) < 0)
7655 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007656 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007657 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007658 }
7659 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007660 while (s < e) {
7661 unsigned char ch = *s;
7662 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007663
Benjamin Peterson29060642009-01-31 22:14:21 +00007664 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7665 w = PyLong_FromLong((long)ch);
7666 if (w == NULL)
7667 goto onError;
7668 x = PyObject_GetItem(mapping, w);
7669 Py_DECREF(w);
7670 if (x == NULL) {
7671 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7672 /* No mapping found means: mapping is undefined. */
7673 PyErr_Clear();
7674 x = Py_None;
7675 Py_INCREF(x);
7676 } else
7677 goto onError;
7678 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007679
Benjamin Peterson29060642009-01-31 22:14:21 +00007680 /* Apply mapping */
7681 if (PyLong_Check(x)) {
7682 long value = PyLong_AS_LONG(x);
7683 if (value < 0 || value > 65535) {
7684 PyErr_SetString(PyExc_TypeError,
7685 "character mapping must be in range(65536)");
7686 Py_DECREF(x);
7687 goto onError;
7688 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007689 if (unicode_putchar(&v, &outpos, value) < 0)
7690 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007691 }
7692 else if (x == Py_None) {
7693 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007694 startinpos = s-starts;
7695 endinpos = startinpos+1;
7696 if (unicode_decode_call_errorhandler(
7697 errors, &errorHandler,
7698 "charmap", "character maps to <undefined>",
7699 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007700 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007701 Py_DECREF(x);
7702 goto onError;
7703 }
7704 Py_DECREF(x);
7705 continue;
7706 }
7707 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007708 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007709
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007710 if (PyUnicode_READY(x) < 0)
7711 goto onError;
7712 targetsize = PyUnicode_GET_LENGTH(x);
7713
7714 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007715 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007716 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007717 PyUnicode_READ_CHAR(x, 0)) < 0)
7718 goto onError;
7719 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007720 else if (targetsize > 1) {
7721 /* 1-n mapping */
7722 if (targetsize > extrachars) {
7723 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007724 Py_ssize_t needed = (targetsize - extrachars) + \
7725 (targetsize << 2);
7726 extrachars += needed;
7727 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007728 if (unicode_resize(&v,
7729 PyUnicode_GET_LENGTH(v) + needed) < 0)
7730 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007731 Py_DECREF(x);
7732 goto onError;
7733 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007735 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7736 goto onError;
7737 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7738 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007739 extrachars -= targetsize;
7740 }
7741 /* 1-0 mapping: skip the character */
7742 }
7743 else {
7744 /* wrong return value */
7745 PyErr_SetString(PyExc_TypeError,
7746 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007747 Py_DECREF(x);
7748 goto onError;
7749 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007750 Py_DECREF(x);
7751 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007752 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007754 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007755 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007756 Py_XDECREF(errorHandler);
7757 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007758 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007759
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007761 Py_XDECREF(errorHandler);
7762 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763 Py_XDECREF(v);
7764 return NULL;
7765}
7766
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007767/* Charmap encoding: the lookup table */
7768
Alexander Belopolsky40018472011-02-26 01:02:56 +00007769struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007770 PyObject_HEAD
7771 unsigned char level1[32];
7772 int count2, count3;
7773 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007774};
7775
7776static PyObject*
7777encoding_map_size(PyObject *obj, PyObject* args)
7778{
7779 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007780 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007781 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007782}
7783
7784static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007785 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007786 PyDoc_STR("Return the size (in bytes) of this object") },
7787 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007788};
7789
/* tp_dealloc slot of EncodingMapType: release the object's memory with
   PyObject_FREE().  Nothing else is released here. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
7795
7796static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007797 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007798 "EncodingMap", /*tp_name*/
7799 sizeof(struct encoding_map), /*tp_basicsize*/
7800 0, /*tp_itemsize*/
7801 /* methods */
7802 encoding_map_dealloc, /*tp_dealloc*/
7803 0, /*tp_print*/
7804 0, /*tp_getattr*/
7805 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007806 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007807 0, /*tp_repr*/
7808 0, /*tp_as_number*/
7809 0, /*tp_as_sequence*/
7810 0, /*tp_as_mapping*/
7811 0, /*tp_hash*/
7812 0, /*tp_call*/
7813 0, /*tp_str*/
7814 0, /*tp_getattro*/
7815 0, /*tp_setattro*/
7816 0, /*tp_as_buffer*/
7817 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7818 0, /*tp_doc*/
7819 0, /*tp_traverse*/
7820 0, /*tp_clear*/
7821 0, /*tp_richcompare*/
7822 0, /*tp_weaklistoffset*/
7823 0, /*tp_iter*/
7824 0, /*tp_iternext*/
7825 encoding_map_methods, /*tp_methods*/
7826 0, /*tp_members*/
7827 0, /*tp_getset*/
7828 0, /*tp_base*/
7829 0, /*tp_dict*/
7830 0, /*tp_descr_get*/
7831 0, /*tp_descr_set*/
7832 0, /*tp_dictoffset*/
7833 0, /*tp_init*/
7834 0, /*tp_alloc*/
7835 0, /*tp_new*/
7836 0, /*tp_free*/
7837 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007838};
7839
7840PyObject*
7841PyUnicode_BuildEncodingMap(PyObject* string)
7842{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007843 PyObject *result;
7844 struct encoding_map *mresult;
7845 int i;
7846 int need_dict = 0;
7847 unsigned char level1[32];
7848 unsigned char level2[512];
7849 unsigned char *mlevel1, *mlevel2, *mlevel3;
7850 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007851 int kind;
7852 void *data;
7853 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007854
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007855 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007856 PyErr_BadArgument();
7857 return NULL;
7858 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007859 kind = PyUnicode_KIND(string);
7860 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007861 memset(level1, 0xFF, sizeof level1);
7862 memset(level2, 0xFF, sizeof level2);
7863
7864 /* If there isn't a one-to-one mapping of NULL to \0,
7865 or if there are non-BMP characters, we need to use
7866 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007867 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007868 need_dict = 1;
7869 for (i = 1; i < 256; i++) {
7870 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007871 ch = PyUnicode_READ(kind, data, i);
7872 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007873 need_dict = 1;
7874 break;
7875 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007876 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007877 /* unmapped character */
7878 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007879 l1 = ch >> 11;
7880 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007881 if (level1[l1] == 0xFF)
7882 level1[l1] = count2++;
7883 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007884 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007885 }
7886
7887 if (count2 >= 0xFF || count3 >= 0xFF)
7888 need_dict = 1;
7889
7890 if (need_dict) {
7891 PyObject *result = PyDict_New();
7892 PyObject *key, *value;
7893 if (!result)
7894 return NULL;
7895 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007896 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007897 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007898 if (!key || !value)
7899 goto failed1;
7900 if (PyDict_SetItem(result, key, value) == -1)
7901 goto failed1;
7902 Py_DECREF(key);
7903 Py_DECREF(value);
7904 }
7905 return result;
7906 failed1:
7907 Py_XDECREF(key);
7908 Py_XDECREF(value);
7909 Py_DECREF(result);
7910 return NULL;
7911 }
7912
7913 /* Create a three-level trie */
7914 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7915 16*count2 + 128*count3 - 1);
7916 if (!result)
7917 return PyErr_NoMemory();
7918 PyObject_Init(result, &EncodingMapType);
7919 mresult = (struct encoding_map*)result;
7920 mresult->count2 = count2;
7921 mresult->count3 = count3;
7922 mlevel1 = mresult->level1;
7923 mlevel2 = mresult->level23;
7924 mlevel3 = mresult->level23 + 16*count2;
7925 memcpy(mlevel1, level1, 32);
7926 memset(mlevel2, 0xFF, 16*count2);
7927 memset(mlevel3, 0, 128*count3);
7928 count3 = 0;
7929 for (i = 1; i < 256; i++) {
7930 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007931 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007932 /* unmapped character */
7933 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007934 o1 = PyUnicode_READ(kind, data, i)>>11;
7935 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007936 i2 = 16*mlevel1[o1] + o2;
7937 if (mlevel2[i2] == 0xFF)
7938 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007939 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007940 i3 = 128*mlevel2[i2] + o3;
7941 mlevel3[i3] = i;
7942 }
7943 return result;
7944}
7945
7946static int
Victor Stinner22168992011-11-20 17:09:18 +01007947encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007948{
7949 struct encoding_map *map = (struct encoding_map*)mapping;
7950 int l1 = c>>11;
7951 int l2 = (c>>7) & 0xF;
7952 int l3 = c & 0x7F;
7953 int i;
7954
Victor Stinner22168992011-11-20 17:09:18 +01007955 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007956 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007957 if (c == 0)
7958 return 0;
7959 /* level 1*/
7960 i = map->level1[l1];
7961 if (i == 0xFF) {
7962 return -1;
7963 }
7964 /* level 2*/
7965 i = map->level23[16*i+l2];
7966 if (i == 0xFF) {
7967 return -1;
7968 }
7969 /* level 3 */
7970 i = map->level23[16*map->count2 + 128*i + l3];
7971 if (i == 0) {
7972 return -1;
7973 }
7974 return i;
7975}
7976
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007977/* Lookup the character ch in the mapping. If the character
7978 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007979 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007980static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007981charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982{
Christian Heimes217cfd12007-12-02 14:31:20 +00007983 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007984 PyObject *x;
7985
7986 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007988 x = PyObject_GetItem(mapping, w);
7989 Py_DECREF(w);
7990 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7992 /* No mapping found means: mapping is undefined. */
7993 PyErr_Clear();
7994 x = Py_None;
7995 Py_INCREF(x);
7996 return x;
7997 } else
7998 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008000 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008001 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008002 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 long value = PyLong_AS_LONG(x);
8004 if (value < 0 || value > 255) {
8005 PyErr_SetString(PyExc_TypeError,
8006 "character mapping must be in range(256)");
8007 Py_DECREF(x);
8008 return NULL;
8009 }
8010 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008012 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008013 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 /* wrong return value */
8016 PyErr_Format(PyExc_TypeError,
8017 "character mapping must return integer, bytes or None, not %.400s",
8018 x->ob_type->tp_name);
8019 Py_DECREF(x);
8020 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 }
8022}
8023
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008024static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008025charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008026{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008027 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8028 /* exponentially overallocate to minimize reallocations */
8029 if (requiredsize < 2*outsize)
8030 requiredsize = 2*outsize;
8031 if (_PyBytes_Resize(outobj, requiredsize))
8032 return -1;
8033 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008034}
8035
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded and written */
    enc_FAILED,         /* the mapping has no encoding for the character */
    enc_EXCEPTION       /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008039/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008040 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008041 space is available. Return a new reference to the object that
8042 was put in the output buffer, or Py_None, if the mapping was undefined
8043 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008044 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008045static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008046charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008047 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008048{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008049 PyObject *rep;
8050 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008051 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008052
Christian Heimes90aa7642007-12-19 02:45:37 +00008053 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008054 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008056 if (res == -1)
8057 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 if (outsize<requiredsize)
8059 if (charmapencode_resize(outobj, outpos, requiredsize))
8060 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008061 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 outstart[(*outpos)++] = (char)res;
8063 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008064 }
8065
8066 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008067 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008069 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008070 Py_DECREF(rep);
8071 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008072 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008073 if (PyLong_Check(rep)) {
8074 Py_ssize_t requiredsize = *outpos+1;
8075 if (outsize<requiredsize)
8076 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8077 Py_DECREF(rep);
8078 return enc_EXCEPTION;
8079 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008080 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008082 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 else {
8084 const char *repchars = PyBytes_AS_STRING(rep);
8085 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8086 Py_ssize_t requiredsize = *outpos+repsize;
8087 if (outsize<requiredsize)
8088 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8089 Py_DECREF(rep);
8090 return enc_EXCEPTION;
8091 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008092 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 memcpy(outstart + *outpos, repchars, repsize);
8094 *outpos += repsize;
8095 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008096 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008097 Py_DECREF(rep);
8098 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008099}
8100
8101/* handle an error in PyUnicode_EncodeCharmap
8102 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008103static int
8104charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008105 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008106 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008107 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008108 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008109{
8110 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008111 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008112 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008113 enum PyUnicode_Kind kind;
8114 void *data;
8115 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008116 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008117 Py_ssize_t collstartpos = *inpos;
8118 Py_ssize_t collendpos = *inpos+1;
8119 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008120 char *encoding = "charmap";
8121 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008122 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008123 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008124 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008125
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008126 if (PyUnicode_READY(unicode) < 0)
8127 return -1;
8128 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008129 /* find all unencodable characters */
8130 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008131 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008132 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008133 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008134 val = encoding_map_lookup(ch, mapping);
8135 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008136 break;
8137 ++collendpos;
8138 continue;
8139 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008140
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008141 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8142 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008143 if (rep==NULL)
8144 return -1;
8145 else if (rep!=Py_None) {
8146 Py_DECREF(rep);
8147 break;
8148 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008149 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008151 }
8152 /* cache callback name lookup
8153 * (if not done yet, i.e. it's the first error) */
8154 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 if ((errors==NULL) || (!strcmp(errors, "strict")))
8156 *known_errorHandler = 1;
8157 else if (!strcmp(errors, "replace"))
8158 *known_errorHandler = 2;
8159 else if (!strcmp(errors, "ignore"))
8160 *known_errorHandler = 3;
8161 else if (!strcmp(errors, "xmlcharrefreplace"))
8162 *known_errorHandler = 4;
8163 else
8164 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008165 }
8166 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008167 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008168 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008169 return -1;
8170 case 2: /* replace */
8171 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008172 x = charmapencode_output('?', mapping, res, respos);
8173 if (x==enc_EXCEPTION) {
8174 return -1;
8175 }
8176 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008177 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 return -1;
8179 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008180 }
8181 /* fall through */
8182 case 3: /* ignore */
8183 *inpos = collendpos;
8184 break;
8185 case 4: /* xmlcharrefreplace */
8186 /* generate replacement (temporarily (mis)uses p) */
8187 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008188 char buffer[2+29+1+1];
8189 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008190 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 for (cp = buffer; *cp; ++cp) {
8192 x = charmapencode_output(*cp, mapping, res, respos);
8193 if (x==enc_EXCEPTION)
8194 return -1;
8195 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008196 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 return -1;
8198 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008199 }
8200 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008201 *inpos = collendpos;
8202 break;
8203 default:
8204 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008205 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008206 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008207 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008208 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008209 if (PyBytes_Check(repunicode)) {
8210 /* Directly copy bytes result to output. */
8211 Py_ssize_t outsize = PyBytes_Size(*res);
8212 Py_ssize_t requiredsize;
8213 repsize = PyBytes_Size(repunicode);
8214 requiredsize = *respos + repsize;
8215 if (requiredsize > outsize)
8216 /* Make room for all additional bytes. */
8217 if (charmapencode_resize(res, respos, requiredsize)) {
8218 Py_DECREF(repunicode);
8219 return -1;
8220 }
8221 memcpy(PyBytes_AsString(*res) + *respos,
8222 PyBytes_AsString(repunicode), repsize);
8223 *respos += repsize;
8224 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008225 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008226 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008227 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008228 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008229 if (PyUnicode_READY(repunicode) < 0) {
8230 Py_DECREF(repunicode);
8231 return -1;
8232 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008233 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008234 data = PyUnicode_DATA(repunicode);
8235 kind = PyUnicode_KIND(repunicode);
8236 for (index = 0; index < repsize; index++) {
8237 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8238 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008240 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 return -1;
8242 }
8243 else if (x==enc_FAILED) {
8244 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008245 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 return -1;
8247 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008248 }
8249 *inpos = newpos;
8250 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008251 }
8252 return 0;
8253}
8254
Alexander Belopolsky40018472011-02-26 01:02:56 +00008255PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008256_PyUnicode_EncodeCharmap(PyObject *unicode,
8257 PyObject *mapping,
8258 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260 /* output object */
8261 PyObject *res = NULL;
8262 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008263 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008264 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008265 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008266 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008267 PyObject *errorHandler = NULL;
8268 PyObject *exc = NULL;
8269 /* the following variable is used for caching string comparisons
8270 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8271 * 3=ignore, 4=xmlcharrefreplace */
8272 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008274 if (PyUnicode_READY(unicode) < 0)
8275 return NULL;
8276 size = PyUnicode_GET_LENGTH(unicode);
8277
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278 /* Default to Latin-1 */
8279 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008280 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008282 /* allocate enough for a simple encoding without
8283 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008284 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008285 if (res == NULL)
8286 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008287 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008288 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008290 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008291 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008292 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008293 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 if (x==enc_EXCEPTION) /* error */
8295 goto onError;
8296 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008297 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 &exc,
8299 &known_errorHandler, &errorHandler, errors,
8300 &res, &respos)) {
8301 goto onError;
8302 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008303 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 else
8305 /* done with this character => adjust input position */
8306 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008309 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008310 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008311 if (_PyBytes_Resize(&res, respos) < 0)
8312 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008313
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008314 Py_XDECREF(exc);
8315 Py_XDECREF(errorHandler);
8316 return res;
8317
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008319 Py_XDECREF(res);
8320 Py_XDECREF(exc);
8321 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322 return NULL;
8323}
8324
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008325/* Deprecated */
8326PyObject *
8327PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8328 Py_ssize_t size,
8329 PyObject *mapping,
8330 const char *errors)
8331{
8332 PyObject *result;
8333 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8334 if (unicode == NULL)
8335 return NULL;
8336 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8337 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008338 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008339}
8340
Alexander Belopolsky40018472011-02-26 01:02:56 +00008341PyObject *
8342PyUnicode_AsCharmapString(PyObject *unicode,
8343 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344{
8345 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 PyErr_BadArgument();
8347 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008349 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008350}
8351
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008353static void
8354make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008355 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008356 Py_ssize_t startpos, Py_ssize_t endpos,
8357 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008358{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008359 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008360 *exceptionObject = _PyUnicodeTranslateError_Create(
8361 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362 }
8363 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8365 goto onError;
8366 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8367 goto onError;
8368 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8369 goto onError;
8370 return;
8371 onError:
8372 Py_DECREF(*exceptionObject);
8373 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374 }
8375}
8376
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008378static void
8379raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008380 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008381 Py_ssize_t startpos, Py_ssize_t endpos,
8382 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008383{
8384 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008385 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008386 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008388}
8389
8390/* error handling callback helper:
8391 build arguments, call the callback and check the arguments,
8392 put the result into newpos and return the replacement string, which
8393 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008394static PyObject *
8395unicode_translate_call_errorhandler(const char *errors,
8396 PyObject **errorHandler,
8397 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008398 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008399 Py_ssize_t startpos, Py_ssize_t endpos,
8400 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008401{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008402 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008404 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008405 PyObject *restuple;
8406 PyObject *resunicode;
8407
8408 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008410 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008412 }
8413
8414 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008415 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008416 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008417 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418
8419 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008423 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008424 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 Py_DECREF(restuple);
8426 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008427 }
8428 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 &resunicode, &i_newpos)) {
8430 Py_DECREF(restuple);
8431 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008432 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008433 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008434 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008435 else
8436 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008437 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8439 Py_DECREF(restuple);
8440 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008441 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008442 Py_INCREF(resunicode);
8443 Py_DECREF(restuple);
8444 return resunicode;
8445}
8446
8447/* Lookup the character ch in the mapping and put the result in result,
8448 which must be decrefed by the caller.
8449 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008450static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008451charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008452{
Christian Heimes217cfd12007-12-02 14:31:20 +00008453 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008454 PyObject *x;
8455
8456 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008458 x = PyObject_GetItem(mapping, w);
8459 Py_DECREF(w);
8460 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8462 /* No mapping found means: use 1:1 mapping. */
8463 PyErr_Clear();
8464 *result = NULL;
8465 return 0;
8466 } else
8467 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008468 }
8469 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 *result = x;
8471 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008472 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008473 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 long value = PyLong_AS_LONG(x);
8475 long max = PyUnicode_GetMax();
8476 if (value < 0 || value > max) {
8477 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008478 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 Py_DECREF(x);
8480 return -1;
8481 }
8482 *result = x;
8483 return 0;
8484 }
8485 else if (PyUnicode_Check(x)) {
8486 *result = x;
8487 return 0;
8488 }
8489 else {
8490 /* wrong return value */
8491 PyErr_SetString(PyExc_TypeError,
8492 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008493 Py_DECREF(x);
8494 return -1;
8495 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008496}
8497/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 if not reallocate and adjust various state variables.
8499 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008500static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008503{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008504 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008505 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 /* exponentially overallocate to minimize reallocations */
8507 if (requiredsize < 2 * oldsize)
8508 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8510 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008512 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008513 }
8514 return 0;
8515}
8516/* lookup the character, put the result in the output string and adjust
8517 various state variables. Return a new reference to the object that
8518 was put in the output buffer in *result, or Py_None, if the mapping was
8519 undefined (in which case no character was written).
8520 The called must decref result.
8521 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008522static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008523charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8524 PyObject *mapping, Py_UCS4 **output,
8525 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008526 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008527{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008528 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8529 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008533 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008534 }
8535 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008537 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008538 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008539 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008540 }
8541 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 Py_ssize_t repsize;
8543 if (PyUnicode_READY(*res) == -1)
8544 return -1;
8545 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008546 if (repsize==1) {
8547 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008548 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 }
8550 else if (repsize!=0) {
8551 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008552 Py_ssize_t requiredsize = *opos +
8553 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008554 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008555 Py_ssize_t i;
8556 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008558 for(i = 0; i < repsize; i++)
8559 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008561 }
8562 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008564 return 0;
8565}
8566
Alexander Belopolsky40018472011-02-26 01:02:56 +00008567PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008568_PyUnicode_TranslateCharmap(PyObject *input,
8569 PyObject *mapping,
8570 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008572 /* input object */
8573 char *idata;
8574 Py_ssize_t size, i;
8575 int kind;
8576 /* output buffer */
8577 Py_UCS4 *output = NULL;
8578 Py_ssize_t osize;
8579 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008580 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008581 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008582 char *reason = "character maps to <undefined>";
8583 PyObject *errorHandler = NULL;
8584 PyObject *exc = NULL;
8585 /* the following variable is used for caching string comparisons
8586 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8587 * 3=ignore, 4=xmlcharrefreplace */
8588 int known_errorHandler = -1;
8589
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 PyErr_BadArgument();
8592 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008595 if (PyUnicode_READY(input) == -1)
8596 return NULL;
8597 idata = (char*)PyUnicode_DATA(input);
8598 kind = PyUnicode_KIND(input);
8599 size = PyUnicode_GET_LENGTH(input);
8600 i = 0;
8601
8602 if (size == 0) {
8603 Py_INCREF(input);
8604 return input;
8605 }
8606
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008607 /* allocate enough for a simple 1:1 translation without
8608 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008609 osize = size;
8610 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8611 opos = 0;
8612 if (output == NULL) {
8613 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008615 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008617 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 /* try to encode it */
8619 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008620 if (charmaptranslate_output(input, i, mapping,
8621 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 Py_XDECREF(x);
8623 goto onError;
8624 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008625 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008626 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 else { /* untranslatable character */
8629 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8630 Py_ssize_t repsize;
8631 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008633 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008634 Py_ssize_t collstart = i;
8635 Py_ssize_t collend = i+1;
8636 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008639 while (collend < size) {
8640 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008641 goto onError;
8642 Py_XDECREF(x);
8643 if (x!=Py_None)
8644 break;
8645 ++collend;
8646 }
8647 /* cache callback name lookup
8648 * (if not done yet, i.e. it's the first error) */
8649 if (known_errorHandler==-1) {
8650 if ((errors==NULL) || (!strcmp(errors, "strict")))
8651 known_errorHandler = 1;
8652 else if (!strcmp(errors, "replace"))
8653 known_errorHandler = 2;
8654 else if (!strcmp(errors, "ignore"))
8655 known_errorHandler = 3;
8656 else if (!strcmp(errors, "xmlcharrefreplace"))
8657 known_errorHandler = 4;
8658 else
8659 known_errorHandler = 0;
8660 }
8661 switch (known_errorHandler) {
8662 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 raise_translate_exception(&exc, input, collstart,
8664 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008665 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008666 case 2: /* replace */
8667 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008668 for (coll = collstart; coll<collend; coll++)
8669 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008670 /* fall through */
8671 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008672 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 break;
8674 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008675 /* generate replacement (temporarily (mis)uses i) */
8676 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 char buffer[2+29+1+1];
8678 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8680 if (charmaptranslate_makespace(&output, &osize,
8681 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 goto onError;
8683 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008686 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 break;
8688 default:
8689 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690 reason, input, &exc,
8691 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008692 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008694 if (PyUnicode_READY(repunicode) < 0) {
8695 Py_DECREF(repunicode);
8696 goto onError;
8697 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008699 repsize = PyUnicode_GET_LENGTH(repunicode);
8700 if (charmaptranslate_makespace(&output, &osize,
8701 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 Py_DECREF(repunicode);
8703 goto onError;
8704 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008705 for (uni2 = 0; repsize-->0; ++uni2)
8706 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8707 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008708 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008709 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008710 }
8711 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8713 if (!res)
8714 goto onError;
8715 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008716 Py_XDECREF(exc);
8717 Py_XDECREF(errorHandler);
8718 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008721 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008722 Py_XDECREF(exc);
8723 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724 return NULL;
8725}
8726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008727/* Deprecated. Use PyUnicode_Translate instead. */
8728PyObject *
8729PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8730 Py_ssize_t size,
8731 PyObject *mapping,
8732 const char *errors)
8733{
8734 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8735 if (!unicode)
8736 return NULL;
8737 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8738}
8739
Alexander Belopolsky40018472011-02-26 01:02:56 +00008740PyObject *
8741PyUnicode_Translate(PyObject *str,
8742 PyObject *mapping,
8743 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008744{
8745 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008746
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747 str = PyUnicode_FromObject(str);
8748 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008749 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008750 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008751 Py_DECREF(str);
8752 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008753
Benjamin Peterson29060642009-01-31 22:14:21 +00008754 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755 Py_XDECREF(str);
8756 return NULL;
8757}
Tim Petersced69f82003-09-16 20:30:58 +00008758
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008759static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008760fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008761{
8762 /* No need to call PyUnicode_READY(self) because this function is only
8763 called as a callback from fixup() which does it already. */
8764 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8765 const int kind = PyUnicode_KIND(self);
8766 void *data = PyUnicode_DATA(self);
8767 Py_UCS4 maxchar = 0, ch, fixed;
8768 Py_ssize_t i;
8769
8770 for (i = 0; i < len; ++i) {
8771 ch = PyUnicode_READ(kind, data, i);
8772 fixed = 0;
8773 if (ch > 127) {
8774 if (Py_UNICODE_ISSPACE(ch))
8775 fixed = ' ';
8776 else {
8777 const int decimal = Py_UNICODE_TODECIMAL(ch);
8778 if (decimal >= 0)
8779 fixed = '0' + decimal;
8780 }
8781 if (fixed != 0) {
8782 if (fixed > maxchar)
8783 maxchar = fixed;
8784 PyUnicode_WRITE(kind, data, i, fixed);
8785 }
8786 else if (ch > maxchar)
8787 maxchar = ch;
8788 }
8789 else if (ch > maxchar)
8790 maxchar = ch;
8791 }
8792
8793 return maxchar;
8794}
8795
8796PyObject *
8797_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8798{
8799 if (!PyUnicode_Check(unicode)) {
8800 PyErr_BadInternalCall();
8801 return NULL;
8802 }
8803 if (PyUnicode_READY(unicode) == -1)
8804 return NULL;
8805 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8806 /* If the string is already ASCII, just return the same string */
8807 Py_INCREF(unicode);
8808 return unicode;
8809 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008810 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008811}
8812
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008813PyObject *
8814PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8815 Py_ssize_t length)
8816{
Victor Stinnerf0124502011-11-21 23:12:56 +01008817 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008818 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008819 Py_UCS4 maxchar;
8820 enum PyUnicode_Kind kind;
8821 void *data;
8822
8823 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008824 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008825 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008826 if (ch > 127) {
8827 int decimal = Py_UNICODE_TODECIMAL(ch);
8828 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008829 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008830 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008831 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008832 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008833
8834 /* Copy to a new string */
8835 decimal = PyUnicode_New(length, maxchar);
8836 if (decimal == NULL)
8837 return decimal;
8838 kind = PyUnicode_KIND(decimal);
8839 data = PyUnicode_DATA(decimal);
8840 /* Iterate over code points */
8841 for (i = 0; i < length; i++) {
8842 Py_UNICODE ch = s[i];
8843 if (ch > 127) {
8844 int decimal = Py_UNICODE_TODECIMAL(ch);
8845 if (decimal >= 0)
8846 ch = '0' + decimal;
8847 }
8848 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008849 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008850 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008851}
/* --- Decimal Encoder ---------------------------------------------------- */
8853
Alexander Belopolsky40018472011-02-26 01:02:56 +00008854int
8855PyUnicode_EncodeDecimal(Py_UNICODE *s,
8856 Py_ssize_t length,
8857 char *output,
8858 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008859{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008860 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008861 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008862 enum PyUnicode_Kind kind;
8863 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008864
8865 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008866 PyErr_BadArgument();
8867 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008868 }
8869
Victor Stinner42bf7752011-11-21 22:52:58 +01008870 unicode = PyUnicode_FromUnicode(s, length);
8871 if (unicode == NULL)
8872 return -1;
8873
Victor Stinner6345be92011-11-25 20:09:01 +01008874 if (PyUnicode_READY(unicode) < 0) {
8875 Py_DECREF(unicode);
8876 return -1;
8877 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008878 kind = PyUnicode_KIND(unicode);
8879 data = PyUnicode_DATA(unicode);
8880
Victor Stinnerb84d7232011-11-22 01:50:07 +01008881 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008882 PyObject *exc;
8883 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008884 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008885 Py_ssize_t startpos;
8886
8887 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008888
Benjamin Peterson29060642009-01-31 22:14:21 +00008889 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008890 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008891 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008892 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008893 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008894 decimal = Py_UNICODE_TODECIMAL(ch);
8895 if (decimal >= 0) {
8896 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008897 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008898 continue;
8899 }
8900 if (0 < ch && ch < 256) {
8901 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008902 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008903 continue;
8904 }
Victor Stinner6345be92011-11-25 20:09:01 +01008905
Victor Stinner42bf7752011-11-21 22:52:58 +01008906 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008907 exc = NULL;
8908 raise_encode_exception(&exc, "decimal", unicode,
8909 startpos, startpos+1,
8910 "invalid decimal Unicode string");
8911 Py_XDECREF(exc);
8912 Py_DECREF(unicode);
8913 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008914 }
8915 /* 0-terminate the output string */
8916 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008917 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008918 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008919}
8920
/* --- Helpers ------------------------------------------------------------ */
8922
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008923static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008924any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008925 Py_ssize_t start,
8926 Py_ssize_t end)
8927{
8928 int kind1, kind2, kind;
8929 void *buf1, *buf2;
8930 Py_ssize_t len1, len2, result;
8931
8932 kind1 = PyUnicode_KIND(s1);
8933 kind2 = PyUnicode_KIND(s2);
8934 kind = kind1 > kind2 ? kind1 : kind2;
8935 buf1 = PyUnicode_DATA(s1);
8936 buf2 = PyUnicode_DATA(s2);
8937 if (kind1 != kind)
8938 buf1 = _PyUnicode_AsKind(s1, kind);
8939 if (!buf1)
8940 return -2;
8941 if (kind2 != kind)
8942 buf2 = _PyUnicode_AsKind(s2, kind);
8943 if (!buf2) {
8944 if (kind1 != kind) PyMem_Free(buf1);
8945 return -2;
8946 }
8947 len1 = PyUnicode_GET_LENGTH(s1);
8948 len2 = PyUnicode_GET_LENGTH(s2);
8949
Victor Stinner794d5672011-10-10 03:21:36 +02008950 if (direction > 0) {
8951 switch(kind) {
8952 case PyUnicode_1BYTE_KIND:
8953 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8954 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8955 else
8956 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8957 break;
8958 case PyUnicode_2BYTE_KIND:
8959 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8960 break;
8961 case PyUnicode_4BYTE_KIND:
8962 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8963 break;
8964 default:
8965 assert(0); result = -2;
8966 }
8967 }
8968 else {
8969 switch(kind) {
8970 case PyUnicode_1BYTE_KIND:
8971 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8972 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8973 else
8974 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8975 break;
8976 case PyUnicode_2BYTE_KIND:
8977 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8978 break;
8979 case PyUnicode_4BYTE_KIND:
8980 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8981 break;
8982 default:
8983 assert(0); result = -2;
8984 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 }
8986
8987 if (kind1 != kind)
8988 PyMem_Free(buf1);
8989 if (kind2 != kind)
8990 PyMem_Free(buf2);
8991
8992 return result;
8993}
8994
8995Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008996_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008997 Py_ssize_t n_buffer,
8998 void *digits, Py_ssize_t n_digits,
8999 Py_ssize_t min_width,
9000 const char *grouping,
9001 const char *thousands_sep)
9002{
9003 switch(kind) {
9004 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009005 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9006 return _PyUnicode_ascii_InsertThousandsGrouping(
9007 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9008 min_width, grouping, thousands_sep);
9009 else
9010 return _PyUnicode_ucs1_InsertThousandsGrouping(
9011 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9012 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009013 case PyUnicode_2BYTE_KIND:
9014 return _PyUnicode_ucs2_InsertThousandsGrouping(
9015 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9016 min_width, grouping, thousands_sep);
9017 case PyUnicode_4BYTE_KIND:
9018 return _PyUnicode_ucs4_InsertThousandsGrouping(
9019 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9020 min_width, grouping, thousands_sep);
9021 }
9022 assert(0);
9023 return -1;
9024}
9025
9026
/* Helper macro to fix up start/end slice values against a length len:
   an end beyond len is clamped to len, negative values are taken relative
   to len, and values that are still negative afterwards are clamped to 0.
   A start beyond len is left untouched.

   Caveats: start and end must be modifiable lvalues, every argument is
   evaluated more than once, and the expansion is a pair of bare if
   statements (not a single statement), so it must not be used as the
   unbraced body of another if/else. */
#define ADJUST_INDICES(start, end, len) \
    if (end > len) \
        end = len; \
    else if (end < 0) { \
        end += len; \
        if (end < 0) \
            end = 0; \
    } \
    if (start < 0) { \
        start += len; \
        if (start < 0) \
            start = 0; \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009041
Alexander Belopolsky40018472011-02-26 01:02:56 +00009042Py_ssize_t
9043PyUnicode_Count(PyObject *str,
9044 PyObject *substr,
9045 Py_ssize_t start,
9046 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009048 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009049 PyObject* str_obj;
9050 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009051 int kind1, kind2, kind;
9052 void *buf1 = NULL, *buf2 = NULL;
9053 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009054
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009055 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009056 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009057 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009058 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009059 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009060 Py_DECREF(str_obj);
9061 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009062 }
Tim Petersced69f82003-09-16 20:30:58 +00009063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 kind1 = PyUnicode_KIND(str_obj);
9065 kind2 = PyUnicode_KIND(sub_obj);
9066 kind = kind1 > kind2 ? kind1 : kind2;
9067 buf1 = PyUnicode_DATA(str_obj);
9068 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009069 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070 if (!buf1)
9071 goto onError;
9072 buf2 = PyUnicode_DATA(sub_obj);
9073 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009074 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009075 if (!buf2)
9076 goto onError;
9077 len1 = PyUnicode_GET_LENGTH(str_obj);
9078 len2 = PyUnicode_GET_LENGTH(sub_obj);
9079
9080 ADJUST_INDICES(start, end, len1);
9081 switch(kind) {
9082 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009083 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9084 result = asciilib_count(
9085 ((Py_UCS1*)buf1) + start, end - start,
9086 buf2, len2, PY_SSIZE_T_MAX
9087 );
9088 else
9089 result = ucs1lib_count(
9090 ((Py_UCS1*)buf1) + start, end - start,
9091 buf2, len2, PY_SSIZE_T_MAX
9092 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009093 break;
9094 case PyUnicode_2BYTE_KIND:
9095 result = ucs2lib_count(
9096 ((Py_UCS2*)buf1) + start, end - start,
9097 buf2, len2, PY_SSIZE_T_MAX
9098 );
9099 break;
9100 case PyUnicode_4BYTE_KIND:
9101 result = ucs4lib_count(
9102 ((Py_UCS4*)buf1) + start, end - start,
9103 buf2, len2, PY_SSIZE_T_MAX
9104 );
9105 break;
9106 default:
9107 assert(0); result = 0;
9108 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009109
9110 Py_DECREF(sub_obj);
9111 Py_DECREF(str_obj);
9112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009113 if (kind1 != kind)
9114 PyMem_Free(buf1);
9115 if (kind2 != kind)
9116 PyMem_Free(buf2);
9117
Guido van Rossumd57fd912000-03-10 22:53:23 +00009118 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009119 onError:
9120 Py_DECREF(sub_obj);
9121 Py_DECREF(str_obj);
9122 if (kind1 != kind && buf1)
9123 PyMem_Free(buf1);
9124 if (kind2 != kind && buf2)
9125 PyMem_Free(buf2);
9126 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009127}
9128
Alexander Belopolsky40018472011-02-26 01:02:56 +00009129Py_ssize_t
9130PyUnicode_Find(PyObject *str,
9131 PyObject *sub,
9132 Py_ssize_t start,
9133 Py_ssize_t end,
9134 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009135{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009136 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009137
Guido van Rossumd57fd912000-03-10 22:53:23 +00009138 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009139 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009140 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009141 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009142 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009143 Py_DECREF(str);
9144 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145 }
Tim Petersced69f82003-09-16 20:30:58 +00009146
Victor Stinner794d5672011-10-10 03:21:36 +02009147 result = any_find_slice(direction,
9148 str, sub, start, end
9149 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009150
Guido van Rossumd57fd912000-03-10 22:53:23 +00009151 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009152 Py_DECREF(sub);
9153
Guido van Rossumd57fd912000-03-10 22:53:23 +00009154 return result;
9155}
9156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009157Py_ssize_t
9158PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9159 Py_ssize_t start, Py_ssize_t end,
9160 int direction)
9161{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009162 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009163 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009164 if (PyUnicode_READY(str) == -1)
9165 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009166 if (start < 0 || end < 0) {
9167 PyErr_SetString(PyExc_IndexError, "string index out of range");
9168 return -2;
9169 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009170 if (end > PyUnicode_GET_LENGTH(str))
9171 end = PyUnicode_GET_LENGTH(str);
9172 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009173 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9174 kind, end-start, ch, direction);
9175 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009176 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009177 else
9178 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179}
9180
Alexander Belopolsky40018472011-02-26 01:02:56 +00009181static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009182tailmatch(PyObject *self,
9183 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009184 Py_ssize_t start,
9185 Py_ssize_t end,
9186 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009187{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009188 int kind_self;
9189 int kind_sub;
9190 void *data_self;
9191 void *data_sub;
9192 Py_ssize_t offset;
9193 Py_ssize_t i;
9194 Py_ssize_t end_sub;
9195
9196 if (PyUnicode_READY(self) == -1 ||
9197 PyUnicode_READY(substring) == -1)
9198 return 0;
9199
9200 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009201 return 1;
9202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009203 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9204 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009205 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009206 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009208 kind_self = PyUnicode_KIND(self);
9209 data_self = PyUnicode_DATA(self);
9210 kind_sub = PyUnicode_KIND(substring);
9211 data_sub = PyUnicode_DATA(substring);
9212 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9213
9214 if (direction > 0)
9215 offset = end;
9216 else
9217 offset = start;
9218
9219 if (PyUnicode_READ(kind_self, data_self, offset) ==
9220 PyUnicode_READ(kind_sub, data_sub, 0) &&
9221 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9222 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9223 /* If both are of the same kind, memcmp is sufficient */
9224 if (kind_self == kind_sub) {
9225 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009226 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009227 data_sub,
9228 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009229 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009230 }
9231 /* otherwise we have to compare each character by first accesing it */
9232 else {
9233 /* We do not need to compare 0 and len(substring)-1 because
9234 the if statement above ensured already that they are equal
9235 when we end up here. */
9236 // TODO: honor direction and do a forward or backwards search
9237 for (i = 1; i < end_sub; ++i) {
9238 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9239 PyUnicode_READ(kind_sub, data_sub, i))
9240 return 0;
9241 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009242 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009244 }
9245
9246 return 0;
9247}
9248
Alexander Belopolsky40018472011-02-26 01:02:56 +00009249Py_ssize_t
9250PyUnicode_Tailmatch(PyObject *str,
9251 PyObject *substr,
9252 Py_ssize_t start,
9253 Py_ssize_t end,
9254 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009255{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009256 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009257
Guido van Rossumd57fd912000-03-10 22:53:23 +00009258 str = PyUnicode_FromObject(str);
9259 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009260 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009261 substr = PyUnicode_FromObject(substr);
9262 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009263 Py_DECREF(str);
9264 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009265 }
Tim Petersced69f82003-09-16 20:30:58 +00009266
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009267 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009268 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009269 Py_DECREF(str);
9270 Py_DECREF(substr);
9271 return result;
9272}
9273
Guido van Rossumd57fd912000-03-10 22:53:23 +00009274/* Apply fixfct filter to the Unicode object self and return a
9275 reference to the modified object */
9276
Alexander Belopolsky40018472011-02-26 01:02:56 +00009277static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009278fixup(PyObject *self,
9279 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009280{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281 PyObject *u;
9282 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009283 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009284
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009285 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009286 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009287 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009288 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009290 /* fix functions return the new maximum character in a string,
9291 if the kind of the resulting unicode object does not change,
9292 everything is fine. Otherwise we need to change the string kind
9293 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009294 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009295
9296 if (maxchar_new == 0) {
9297 /* no changes */;
9298 if (PyUnicode_CheckExact(self)) {
9299 Py_DECREF(u);
9300 Py_INCREF(self);
9301 return self;
9302 }
9303 else
9304 return u;
9305 }
9306
9307 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308 maxchar_new = 127;
9309 else if (maxchar_new <= 255)
9310 maxchar_new = 255;
9311 else if (maxchar_new <= 65535)
9312 maxchar_new = 65535;
9313 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009314 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315
Victor Stinnereaab6042011-12-11 22:22:39 +01009316 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009318
9319 /* In case the maximum character changed, we need to
9320 convert the string to the new category. */
9321 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9322 if (v == NULL) {
9323 Py_DECREF(u);
9324 return NULL;
9325 }
9326 if (maxchar_new > maxchar_old) {
9327 /* If the maxchar increased so that the kind changed, not all
9328 characters are representable anymore and we need to fix the
9329 string again. This only happens in very few cases. */
9330 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9331 maxchar_old = fixfct(v);
9332 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009333 }
9334 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009335 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009336 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009337 Py_DECREF(u);
9338 assert(_PyUnicode_CheckConsistency(v, 1));
9339 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009340}
9341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009343fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009344{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009345 /* No need to call PyUnicode_READY(self) because this function is only
9346 called as a callback from fixup() which does it already. */
9347 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9348 const int kind = PyUnicode_KIND(self);
9349 void *data = PyUnicode_DATA(self);
9350 int touched = 0;
9351 Py_UCS4 maxchar = 0;
9352 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009353
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009354 for (i = 0; i < len; ++i) {
9355 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9356 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9357 if (up != ch) {
9358 if (up > maxchar)
9359 maxchar = up;
9360 PyUnicode_WRITE(kind, data, i, up);
9361 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009362 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 else if (ch > maxchar)
9364 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365 }
9366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009367 if (touched)
9368 return maxchar;
9369 else
9370 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009371}
9372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009373static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009374fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009376 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9377 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9378 const int kind = PyUnicode_KIND(self);
9379 void *data = PyUnicode_DATA(self);
9380 int touched = 0;
9381 Py_UCS4 maxchar = 0;
9382 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009384 for(i = 0; i < len; ++i) {
9385 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9386 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9387 if (lo != ch) {
9388 if (lo > maxchar)
9389 maxchar = lo;
9390 PyUnicode_WRITE(kind, data, i, lo);
9391 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009392 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009393 else if (ch > maxchar)
9394 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009395 }
9396
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009397 if (touched)
9398 return maxchar;
9399 else
9400 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009401}
9402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009403static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009404fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009405{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9407 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9408 const int kind = PyUnicode_KIND(self);
9409 void *data = PyUnicode_DATA(self);
9410 int touched = 0;
9411 Py_UCS4 maxchar = 0;
9412 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 for(i = 0; i < len; ++i) {
9415 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9416 Py_UCS4 nu = 0;
9417
9418 if (Py_UNICODE_ISUPPER(ch))
9419 nu = Py_UNICODE_TOLOWER(ch);
9420 else if (Py_UNICODE_ISLOWER(ch))
9421 nu = Py_UNICODE_TOUPPER(ch);
9422
9423 if (nu != 0) {
9424 if (nu > maxchar)
9425 maxchar = nu;
9426 PyUnicode_WRITE(kind, data, i, nu);
9427 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009428 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009429 else if (ch > maxchar)
9430 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009431 }
9432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009433 if (touched)
9434 return maxchar;
9435 else
9436 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009437}
9438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009440fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009441{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009442 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9443 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9444 const int kind = PyUnicode_KIND(self);
9445 void *data = PyUnicode_DATA(self);
9446 int touched = 0;
9447 Py_UCS4 maxchar = 0;
9448 Py_ssize_t i = 0;
9449 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009450
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009451 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009452 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453
9454 ch = PyUnicode_READ(kind, data, i);
9455 if (!Py_UNICODE_ISUPPER(ch)) {
9456 maxchar = Py_UNICODE_TOUPPER(ch);
9457 PyUnicode_WRITE(kind, data, i, maxchar);
9458 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460 ++i;
9461 for(; i < len; ++i) {
9462 ch = PyUnicode_READ(kind, data, i);
9463 if (!Py_UNICODE_ISLOWER(ch)) {
9464 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9465 if (lo > maxchar)
9466 maxchar = lo;
9467 PyUnicode_WRITE(kind, data, i, lo);
9468 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009469 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 else if (ch > maxchar)
9471 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473
9474 if (touched)
9475 return maxchar;
9476 else
9477 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478}
9479
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009480static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009481fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9484 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9485 const int kind = PyUnicode_KIND(self);
9486 void *data = PyUnicode_DATA(self);
9487 Py_UCS4 maxchar = 0;
9488 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009489 int previous_is_cased;
9490
9491 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009492 if (len == 1) {
9493 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9494 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9495 if (ti != ch) {
9496 PyUnicode_WRITE(kind, data, i, ti);
9497 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009498 }
9499 else
9500 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009501 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503 for(; i < len; ++i) {
9504 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9505 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009506
Benjamin Peterson29060642009-01-31 22:14:21 +00009507 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009509 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510 nu = Py_UNICODE_TOTITLE(ch);
9511
9512 if (nu > maxchar)
9513 maxchar = nu;
9514 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009515
Benjamin Peterson29060642009-01-31 22:14:21 +00009516 if (Py_UNICODE_ISLOWER(ch) ||
9517 Py_UNICODE_ISUPPER(ch) ||
9518 Py_UNICODE_ISTITLE(ch))
9519 previous_is_cased = 1;
9520 else
9521 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009524}
9525
Tim Peters8ce9f162004-08-27 01:49:32 +00009526PyObject *
9527PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009528{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009529 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009530 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009531 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009532 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009533 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9534 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009535 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009537 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009539 int use_memcpy;
9540 unsigned char *res_data = NULL, *sep_data = NULL;
9541 PyObject *last_obj;
9542 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009543
Tim Peters05eba1f2004-08-27 21:32:02 +00009544 fseq = PySequence_Fast(seq, "");
9545 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009546 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009547 }
9548
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009549 /* NOTE: the following code can't call back into Python code,
9550 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009551 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009552
Tim Peters05eba1f2004-08-27 21:32:02 +00009553 seqlen = PySequence_Fast_GET_SIZE(fseq);
9554 /* If empty sequence, return u"". */
9555 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009556 Py_DECREF(fseq);
9557 Py_INCREF(unicode_empty);
9558 res = unicode_empty;
9559 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009560 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009561
Tim Peters05eba1f2004-08-27 21:32:02 +00009562 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009563 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009564 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009565 if (seqlen == 1) {
9566 if (PyUnicode_CheckExact(items[0])) {
9567 res = items[0];
9568 Py_INCREF(res);
9569 Py_DECREF(fseq);
9570 return res;
9571 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009572 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009573 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009574 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009575 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009576 /* Set up sep and seplen */
9577 if (separator == NULL) {
9578 /* fall back to a blank space separator */
9579 sep = PyUnicode_FromOrdinal(' ');
9580 if (!sep)
9581 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009582 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009583 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009584 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009585 else {
9586 if (!PyUnicode_Check(separator)) {
9587 PyErr_Format(PyExc_TypeError,
9588 "separator: expected str instance,"
9589 " %.80s found",
9590 Py_TYPE(separator)->tp_name);
9591 goto onError;
9592 }
9593 if (PyUnicode_READY(separator))
9594 goto onError;
9595 sep = separator;
9596 seplen = PyUnicode_GET_LENGTH(separator);
9597 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9598 /* inc refcount to keep this code path symmetric with the
9599 above case of a blank separator */
9600 Py_INCREF(sep);
9601 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009602 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009603 }
9604
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009605 /* There are at least two things to join, or else we have a subclass
9606 * of str in the sequence.
9607 * Do a pre-pass to figure out the total amount of space we'll
9608 * need (sz), and see whether all argument are strings.
9609 */
9610 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009611#ifdef Py_DEBUG
9612 use_memcpy = 0;
9613#else
9614 use_memcpy = 1;
9615#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009616 for (i = 0; i < seqlen; i++) {
9617 const Py_ssize_t old_sz = sz;
9618 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009619 if (!PyUnicode_Check(item)) {
9620 PyErr_Format(PyExc_TypeError,
9621 "sequence item %zd: expected str instance,"
9622 " %.80s found",
9623 i, Py_TYPE(item)->tp_name);
9624 goto onError;
9625 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626 if (PyUnicode_READY(item) == -1)
9627 goto onError;
9628 sz += PyUnicode_GET_LENGTH(item);
9629 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009630 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009631 if (i != 0)
9632 sz += seplen;
9633 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9634 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009635 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009636 goto onError;
9637 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009638 if (use_memcpy && last_obj != NULL) {
9639 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9640 use_memcpy = 0;
9641 }
9642 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009643 }
Tim Petersced69f82003-09-16 20:30:58 +00009644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009645 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009646 if (res == NULL)
9647 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009648
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009649 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009650#ifdef Py_DEBUG
9651 use_memcpy = 0;
9652#else
9653 if (use_memcpy) {
9654 res_data = PyUnicode_1BYTE_DATA(res);
9655 kind = PyUnicode_KIND(res);
9656 if (seplen != 0)
9657 sep_data = PyUnicode_1BYTE_DATA(sep);
9658 }
9659#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009660 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009661 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009662 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009663 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009664 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009665 if (use_memcpy) {
9666 Py_MEMCPY(res_data,
9667 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009668 kind * seplen);
9669 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009670 }
9671 else {
9672 copy_characters(res, res_offset, sep, 0, seplen);
9673 res_offset += seplen;
9674 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009675 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009676 itemlen = PyUnicode_GET_LENGTH(item);
9677 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009678 if (use_memcpy) {
9679 Py_MEMCPY(res_data,
9680 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009681 kind * itemlen);
9682 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009683 }
9684 else {
9685 copy_characters(res, res_offset, item, 0, itemlen);
9686 res_offset += itemlen;
9687 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009688 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009689 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009690 if (use_memcpy)
9691 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009692 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009693 else
9694 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009695
Tim Peters05eba1f2004-08-27 21:32:02 +00009696 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009698 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700
Benjamin Peterson29060642009-01-31 22:14:21 +00009701 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009702 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009704 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009705 return NULL;
9706}
9707
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive characters of the buffer `data`, beginning at character index
   `start`.  `kind` selects the character width: PyUnicode_1BYTE_KIND,
   PyUnicode_2BYTE_KIND, or anything else for 4-byte characters;
   PyUnicode_WCHAR_KIND is rejected by an assertion.

   Every argument is parenthesized in the expansion so that expression
   arguments keep their meaning.  `kind`, `value` and `length` may be
   evaluated more than once, so pass expressions without side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
9730
Victor Stinner9310abb2011-10-05 00:59:23 +02009731static PyObject *
9732pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009733 Py_ssize_t left,
9734 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009737 PyObject *u;
9738 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009739 int kind;
9740 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009741
9742 if (left < 0)
9743 left = 0;
9744 if (right < 0)
9745 right = 0;
9746
Victor Stinnerc4b49542011-12-11 22:44:26 +01009747 if (left == 0 && right == 0)
9748 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9751 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009752 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9753 return NULL;
9754 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009755 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9756 if (fill > maxchar)
9757 maxchar = fill;
9758 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009759 if (!u)
9760 return NULL;
9761
9762 kind = PyUnicode_KIND(u);
9763 data = PyUnicode_DATA(u);
9764 if (left)
9765 FILL(kind, data, fill, 0, left);
9766 if (right)
9767 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009768 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009769 assert(_PyUnicode_CheckConsistency(u, 1));
9770 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009772#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009773
Alexander Belopolsky40018472011-02-26 01:02:56 +00009774PyObject *
9775PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009776{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009777 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009778
9779 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009781 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783 switch(PyUnicode_KIND(string)) {
9784 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009785 if (PyUnicode_IS_ASCII(string))
9786 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009787 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009788 PyUnicode_GET_LENGTH(string), keepends);
9789 else
9790 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009791 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009792 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009793 break;
9794 case PyUnicode_2BYTE_KIND:
9795 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009796 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009797 PyUnicode_GET_LENGTH(string), keepends);
9798 break;
9799 case PyUnicode_4BYTE_KIND:
9800 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009801 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009802 PyUnicode_GET_LENGTH(string), keepends);
9803 break;
9804 default:
9805 assert(0);
9806 list = 0;
9807 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009808 Py_DECREF(string);
9809 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009810}
9811
Alexander Belopolsky40018472011-02-26 01:02:56 +00009812static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009813split(PyObject *self,
9814 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009815 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009816{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 int kind1, kind2, kind;
9818 void *buf1, *buf2;
9819 Py_ssize_t len1, len2;
9820 PyObject* out;
9821
Guido van Rossumd57fd912000-03-10 22:53:23 +00009822 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009823 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009825 if (PyUnicode_READY(self) == -1)
9826 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 if (substring == NULL)
9829 switch(PyUnicode_KIND(self)) {
9830 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009831 if (PyUnicode_IS_ASCII(self))
9832 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009833 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009834 PyUnicode_GET_LENGTH(self), maxcount
9835 );
9836 else
9837 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009838 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009839 PyUnicode_GET_LENGTH(self), maxcount
9840 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009841 case PyUnicode_2BYTE_KIND:
9842 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009843 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009844 PyUnicode_GET_LENGTH(self), maxcount
9845 );
9846 case PyUnicode_4BYTE_KIND:
9847 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009848 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009849 PyUnicode_GET_LENGTH(self), maxcount
9850 );
9851 default:
9852 assert(0);
9853 return NULL;
9854 }
9855
9856 if (PyUnicode_READY(substring) == -1)
9857 return NULL;
9858
9859 kind1 = PyUnicode_KIND(self);
9860 kind2 = PyUnicode_KIND(substring);
9861 kind = kind1 > kind2 ? kind1 : kind2;
9862 buf1 = PyUnicode_DATA(self);
9863 buf2 = PyUnicode_DATA(substring);
9864 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009865 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 if (!buf1)
9867 return NULL;
9868 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009869 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 if (!buf2) {
9871 if (kind1 != kind) PyMem_Free(buf1);
9872 return NULL;
9873 }
9874 len1 = PyUnicode_GET_LENGTH(self);
9875 len2 = PyUnicode_GET_LENGTH(substring);
9876
9877 switch(kind) {
9878 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009879 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9880 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009881 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009882 else
9883 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009884 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009885 break;
9886 case PyUnicode_2BYTE_KIND:
9887 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009888 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009889 break;
9890 case PyUnicode_4BYTE_KIND:
9891 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009892 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009893 break;
9894 default:
9895 out = NULL;
9896 }
9897 if (kind1 != kind)
9898 PyMem_Free(buf1);
9899 if (kind2 != kind)
9900 PyMem_Free(buf2);
9901 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009902}
9903
Alexander Belopolsky40018472011-02-26 01:02:56 +00009904static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009905rsplit(PyObject *self,
9906 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009907 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009908{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009909 int kind1, kind2, kind;
9910 void *buf1, *buf2;
9911 Py_ssize_t len1, len2;
9912 PyObject* out;
9913
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009914 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009915 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 if (PyUnicode_READY(self) == -1)
9918 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920 if (substring == NULL)
9921 switch(PyUnicode_KIND(self)) {
9922 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009923 if (PyUnicode_IS_ASCII(self))
9924 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009925 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009926 PyUnicode_GET_LENGTH(self), maxcount
9927 );
9928 else
9929 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009930 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009931 PyUnicode_GET_LENGTH(self), maxcount
9932 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009933 case PyUnicode_2BYTE_KIND:
9934 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009935 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009936 PyUnicode_GET_LENGTH(self), maxcount
9937 );
9938 case PyUnicode_4BYTE_KIND:
9939 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009940 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 PyUnicode_GET_LENGTH(self), maxcount
9942 );
9943 default:
9944 assert(0);
9945 return NULL;
9946 }
9947
9948 if (PyUnicode_READY(substring) == -1)
9949 return NULL;
9950
9951 kind1 = PyUnicode_KIND(self);
9952 kind2 = PyUnicode_KIND(substring);
9953 kind = kind1 > kind2 ? kind1 : kind2;
9954 buf1 = PyUnicode_DATA(self);
9955 buf2 = PyUnicode_DATA(substring);
9956 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009957 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009958 if (!buf1)
9959 return NULL;
9960 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009961 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009962 if (!buf2) {
9963 if (kind1 != kind) PyMem_Free(buf1);
9964 return NULL;
9965 }
9966 len1 = PyUnicode_GET_LENGTH(self);
9967 len2 = PyUnicode_GET_LENGTH(substring);
9968
9969 switch(kind) {
9970 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009971 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9972 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009973 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009974 else
9975 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009976 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009977 break;
9978 case PyUnicode_2BYTE_KIND:
9979 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009980 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009981 break;
9982 case PyUnicode_4BYTE_KIND:
9983 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009984 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009985 break;
9986 default:
9987 out = NULL;
9988 }
9989 if (kind1 != kind)
9990 PyMem_Free(buf1);
9991 if (kind2 != kind)
9992 PyMem_Free(buf2);
9993 return out;
9994}
9995
9996static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009997anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9998 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999{
10000 switch(kind) {
10001 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010002 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10003 return asciilib_find(buf1, len1, buf2, len2, offset);
10004 else
10005 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010006 case PyUnicode_2BYTE_KIND:
10007 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10008 case PyUnicode_4BYTE_KIND:
10009 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10010 }
10011 assert(0);
10012 return -1;
10013}
10014
10015static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010016anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10017 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018{
10019 switch(kind) {
10020 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010021 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10022 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10023 else
10024 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 case PyUnicode_2BYTE_KIND:
10026 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10027 case PyUnicode_4BYTE_KIND:
10028 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10029 }
10030 assert(0);
10031 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010032}
10033
Alexander Belopolsky40018472011-02-26 01:02:56 +000010034static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035replace(PyObject *self, PyObject *str1,
10036 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010037{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 PyObject *u;
10039 char *sbuf = PyUnicode_DATA(self);
10040 char *buf1 = PyUnicode_DATA(str1);
10041 char *buf2 = PyUnicode_DATA(str2);
10042 int srelease = 0, release1 = 0, release2 = 0;
10043 int skind = PyUnicode_KIND(self);
10044 int kind1 = PyUnicode_KIND(str1);
10045 int kind2 = PyUnicode_KIND(str2);
10046 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10047 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10048 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010049 int mayshrink;
10050 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010051
10052 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010053 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010054 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010055 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010056
Victor Stinner59de0ee2011-10-07 10:01:28 +020010057 if (str1 == str2)
10058 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 if (skind < kind1)
10060 /* substring too wide to be present */
10061 goto nothing;
10062
Victor Stinner49a0a212011-10-12 23:46:10 +020010063 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10064 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10065 /* Replacing str1 with str2 may cause a maxchar reduction in the
10066 result string. */
10067 mayshrink = (maxchar_str2 < maxchar);
10068 maxchar = Py_MAX(maxchar, maxchar_str2);
10069
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010071 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010072 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010074 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010076 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010077 Py_UCS4 u1, u2;
10078 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010079 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010080 if (findchar(sbuf, PyUnicode_KIND(self),
10081 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010082 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010083 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010085 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010087 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 rkind = PyUnicode_KIND(u);
10089 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10090 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010091 if (--maxcount < 0)
10092 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010094 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010095 }
10096 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 int rkind = skind;
10098 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010100 if (kind1 < rkind) {
10101 /* widen substring */
10102 buf1 = _PyUnicode_AsKind(str1, rkind);
10103 if (!buf1) goto error;
10104 release1 = 1;
10105 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010106 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010107 if (i < 0)
10108 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 if (rkind > kind2) {
10110 /* widen replacement */
10111 buf2 = _PyUnicode_AsKind(str2, rkind);
10112 if (!buf2) goto error;
10113 release2 = 1;
10114 }
10115 else if (rkind < kind2) {
10116 /* widen self and buf1 */
10117 rkind = kind2;
10118 if (release1) PyMem_Free(buf1);
10119 sbuf = _PyUnicode_AsKind(self, rkind);
10120 if (!sbuf) goto error;
10121 srelease = 1;
10122 buf1 = _PyUnicode_AsKind(str1, rkind);
10123 if (!buf1) goto error;
10124 release1 = 1;
10125 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010126 u = PyUnicode_New(slen, maxchar);
10127 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010129 assert(PyUnicode_KIND(u) == rkind);
10130 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010131
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010132 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010133 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010134 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010136 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010138
10139 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010140 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010141 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010142 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010143 if (i == -1)
10144 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010145 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010146 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010147 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010149 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010151 }
10152 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 Py_ssize_t n, i, j, ires;
10154 Py_ssize_t product, new_size;
10155 int rkind = skind;
10156 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010159 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 buf1 = _PyUnicode_AsKind(str1, rkind);
10161 if (!buf1) goto error;
10162 release1 = 1;
10163 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010164 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010165 if (n == 0)
10166 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010168 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 buf2 = _PyUnicode_AsKind(str2, rkind);
10170 if (!buf2) goto error;
10171 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010174 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 rkind = kind2;
10176 sbuf = _PyUnicode_AsKind(self, rkind);
10177 if (!sbuf) goto error;
10178 srelease = 1;
10179 if (release1) PyMem_Free(buf1);
10180 buf1 = _PyUnicode_AsKind(str1, rkind);
10181 if (!buf1) goto error;
10182 release1 = 1;
10183 }
10184 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10185 PyUnicode_GET_LENGTH(str1))); */
10186 product = n * (len2-len1);
10187 if ((product / (len2-len1)) != n) {
10188 PyErr_SetString(PyExc_OverflowError,
10189 "replace string is too long");
10190 goto error;
10191 }
10192 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010193 if (new_size == 0) {
10194 Py_INCREF(unicode_empty);
10195 u = unicode_empty;
10196 goto done;
10197 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10199 PyErr_SetString(PyExc_OverflowError,
10200 "replace string is too long");
10201 goto error;
10202 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010203 u = PyUnicode_New(new_size, maxchar);
10204 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010206 assert(PyUnicode_KIND(u) == rkind);
10207 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 ires = i = 0;
10209 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010210 while (n-- > 0) {
10211 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010212 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010213 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010214 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010215 if (j == -1)
10216 break;
10217 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010218 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010219 memcpy(res + rkind * ires,
10220 sbuf + rkind * i,
10221 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010223 }
10224 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010226 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010228 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010230 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010232 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010234 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010235 memcpy(res + rkind * ires,
10236 sbuf + rkind * i,
10237 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010238 }
10239 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010240 /* interleave */
10241 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010242 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010244 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010246 if (--n <= 0)
10247 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010248 memcpy(res + rkind * ires,
10249 sbuf + rkind * i,
10250 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 ires++;
10252 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010253 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010254 memcpy(res + rkind * ires,
10255 sbuf + rkind * i,
10256 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010257 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010258 }
10259
10260 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010261 unicode_adjust_maxchar(&u);
10262 if (u == NULL)
10263 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010265
10266 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 if (srelease)
10268 PyMem_FREE(sbuf);
10269 if (release1)
10270 PyMem_FREE(buf1);
10271 if (release2)
10272 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010273 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010274 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010275
Benjamin Peterson29060642009-01-31 22:14:21 +000010276 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010277 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 if (srelease)
10279 PyMem_FREE(sbuf);
10280 if (release1)
10281 PyMem_FREE(buf1);
10282 if (release2)
10283 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010284 return unicode_result_unchanged(self);
10285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 error:
10287 if (srelease && sbuf)
10288 PyMem_FREE(sbuf);
10289 if (release1 && buf1)
10290 PyMem_FREE(buf1);
10291 if (release2 && buf2)
10292 PyMem_FREE(buf2);
10293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010294}
10295
10296/* --- Unicode Object Methods --------------------------------------------- */
10297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010298PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010299 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010300\n\
10301Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010302characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010303
10304static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010305unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010307 return fixup(self, fixtitle);
10308}
10309
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010310PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010311 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010312\n\
10313Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010314have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010315
10316static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010317unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010318{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010319 return fixup(self, fixcapitalize);
10320}
10321
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out.  Splits self with split(self, NULL, -1), replaces every
   list item by its fixup(..., fixcapitalize) result and joins the list
   with PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list items in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *original = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup(original, fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(original);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10359
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010360/* Argument converter. Coerces to a single unicode character */
10361
10362static int
10363convert_uc(PyObject *obj, void *addr)
10364{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010366 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010367
Benjamin Peterson14339b62009-01-31 16:36:08 +000010368 uniobj = PyUnicode_FromObject(obj);
10369 if (uniobj == NULL) {
10370 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010371 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010372 return 0;
10373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010375 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010376 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010377 Py_DECREF(uniobj);
10378 return 0;
10379 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010381 Py_DECREF(uniobj);
10382 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010383}
10384
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010385PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010386 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010387\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010388Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010389done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010390
10391static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010392unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010394 Py_ssize_t marg, left;
10395 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 Py_UCS4 fillchar = ' ';
10397
Victor Stinnere9a29352011-10-01 02:14:59 +020010398 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010400
Victor Stinnerc4b49542011-12-11 22:44:26 +010010401 if (PyUnicode_READY(self) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010402 return NULL;
10403
Victor Stinnerc4b49542011-12-11 22:44:26 +010010404 if (PyUnicode_GET_LENGTH(self) >= width)
10405 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406
Victor Stinnerc4b49542011-12-11 22:44:26 +010010407 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010408 left = marg / 2 + (marg & width & 1);
10409
Victor Stinner9310abb2011-10-05 00:59:23 +020010410 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010411}
10412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413/* This function assumes that str1 and str2 are readied by the caller. */
10414
Marc-André Lemburge5034372000-08-08 08:04:29 +000010415static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010416unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010417{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 int kind1, kind2;
10419 void *data1, *data2;
10420 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010422 kind1 = PyUnicode_KIND(str1);
10423 kind2 = PyUnicode_KIND(str2);
10424 data1 = PyUnicode_DATA(str1);
10425 data2 = PyUnicode_DATA(str2);
10426 len1 = PyUnicode_GET_LENGTH(str1);
10427 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 for (i = 0; i < len1 && i < len2; ++i) {
10430 Py_UCS4 c1, c2;
10431 c1 = PyUnicode_READ(kind1, data1, i);
10432 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010433
10434 if (c1 != c2)
10435 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010436 }
10437
10438 return (len1 < len2) ? -1 : (len1 != len2);
10439}
10440
Alexander Belopolsky40018472011-02-26 01:02:56 +000010441int
10442PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010443{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10445 if (PyUnicode_READY(left) == -1 ||
10446 PyUnicode_READY(right) == -1)
10447 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010448 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010450 PyErr_Format(PyExc_TypeError,
10451 "Can't compare %.100s and %.100s",
10452 left->ob_type->tp_name,
10453 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010454 return -1;
10455}
10456
Martin v. Löwis5b222132007-06-10 09:51:05 +000010457int
10458PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10459{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 Py_ssize_t i;
10461 int kind;
10462 void *data;
10463 Py_UCS4 chr;
10464
Victor Stinner910337b2011-10-03 03:20:16 +020010465 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 if (PyUnicode_READY(uni) == -1)
10467 return -1;
10468 kind = PyUnicode_KIND(uni);
10469 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010470 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10472 if (chr != str[i])
10473 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010474 /* This check keeps Python strings that end in '\0' from comparing equal
10475 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010477 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010478 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010479 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010480 return 0;
10481}
10482
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010483
Benjamin Peterson29060642009-01-31 22:14:21 +000010484#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010485 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010486
Alexander Belopolsky40018472011-02-26 01:02:56 +000010487PyObject *
10488PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010489{
10490 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010491
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010492 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10493 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 if (PyUnicode_READY(left) == -1 ||
10495 PyUnicode_READY(right) == -1)
10496 return NULL;
10497 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10498 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010499 if (op == Py_EQ) {
10500 Py_INCREF(Py_False);
10501 return Py_False;
10502 }
10503 if (op == Py_NE) {
10504 Py_INCREF(Py_True);
10505 return Py_True;
10506 }
10507 }
10508 if (left == right)
10509 result = 0;
10510 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010511 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010512
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010513 /* Convert the return value to a Boolean */
10514 switch (op) {
10515 case Py_EQ:
10516 v = TEST_COND(result == 0);
10517 break;
10518 case Py_NE:
10519 v = TEST_COND(result != 0);
10520 break;
10521 case Py_LE:
10522 v = TEST_COND(result <= 0);
10523 break;
10524 case Py_GE:
10525 v = TEST_COND(result >= 0);
10526 break;
10527 case Py_LT:
10528 v = TEST_COND(result == -1);
10529 break;
10530 case Py_GT:
10531 v = TEST_COND(result == 1);
10532 break;
10533 default:
10534 PyErr_BadArgument();
10535 return NULL;
10536 }
10537 Py_INCREF(v);
10538 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010539 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010540
Brian Curtindfc80e32011-08-10 20:28:54 -050010541 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010542}
10543
Alexander Belopolsky40018472011-02-26 01:02:56 +000010544int
10545PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010546{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010547 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 int kind1, kind2, kind;
10549 void *buf1, *buf2;
10550 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010551 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010552
10553 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010554 sub = PyUnicode_FromObject(element);
10555 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010556 PyErr_Format(PyExc_TypeError,
10557 "'in <string>' requires string as left operand, not %s",
10558 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010559 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010560 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 if (PyUnicode_READY(sub) == -1)
10562 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010563
Thomas Wouters477c8d52006-05-27 19:21:47 +000010564 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010565 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010566 Py_DECREF(sub);
10567 return -1;
10568 }
10569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 kind1 = PyUnicode_KIND(str);
10571 kind2 = PyUnicode_KIND(sub);
10572 kind = kind1 > kind2 ? kind1 : kind2;
10573 buf1 = PyUnicode_DATA(str);
10574 buf2 = PyUnicode_DATA(sub);
10575 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010576 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 if (!buf1) {
10578 Py_DECREF(sub);
10579 return -1;
10580 }
10581 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010582 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 if (!buf2) {
10584 Py_DECREF(sub);
10585 if (kind1 != kind) PyMem_Free(buf1);
10586 return -1;
10587 }
10588 len1 = PyUnicode_GET_LENGTH(str);
10589 len2 = PyUnicode_GET_LENGTH(sub);
10590
10591 switch(kind) {
10592 case PyUnicode_1BYTE_KIND:
10593 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10594 break;
10595 case PyUnicode_2BYTE_KIND:
10596 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10597 break;
10598 case PyUnicode_4BYTE_KIND:
10599 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10600 break;
10601 default:
10602 result = -1;
10603 assert(0);
10604 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010605
10606 Py_DECREF(str);
10607 Py_DECREF(sub);
10608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 if (kind1 != kind)
10610 PyMem_Free(buf1);
10611 if (kind2 != kind)
10612 PyMem_Free(buf2);
10613
Guido van Rossum403d68b2000-03-13 15:55:09 +000010614 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010615}
10616
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617/* Concat to string or Unicode object giving a new Unicode object. */
10618
Alexander Belopolsky40018472011-02-26 01:02:56 +000010619PyObject *
10620PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010623 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010624 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010625
10626 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010629 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010631 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010632 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010633
10634 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010635 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010636 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010638 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010639 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010640 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642 }
10643
Victor Stinner488fa492011-12-12 00:01:39 +010010644 u_len = PyUnicode_GET_LENGTH(u);
10645 v_len = PyUnicode_GET_LENGTH(v);
10646 if (u_len > PY_SSIZE_T_MAX - v_len) {
10647 PyErr_SetString(PyExc_OverflowError,
10648 "strings are too large to concat");
10649 goto onError;
10650 }
10651 new_len = u_len + v_len;
10652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010654 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10655 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656
Guido van Rossumd57fd912000-03-10 22:53:23 +000010657 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010658 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010659 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010660 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010010661 copy_characters(w, 0, u, 0, u_len);
10662 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010663 Py_DECREF(u);
10664 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010665 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010666 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010667
Benjamin Peterson29060642009-01-31 22:14:21 +000010668 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010669 Py_XDECREF(u);
10670 Py_XDECREF(v);
10671 return NULL;
10672}
10673
Walter Dörwald1ab83302007-05-18 17:15:44 +000010674void
Victor Stinner23e56682011-10-03 03:54:37 +020010675PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010676{
Victor Stinner23e56682011-10-03 03:54:37 +020010677 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010678 Py_UCS4 maxchar, maxchar2;
10679 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010680
10681 if (p_left == NULL) {
10682 if (!PyErr_Occurred())
10683 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010684 return;
10685 }
Victor Stinner23e56682011-10-03 03:54:37 +020010686 left = *p_left;
10687 if (right == NULL || !PyUnicode_Check(left)) {
10688 if (!PyErr_Occurred())
10689 PyErr_BadInternalCall();
10690 goto error;
10691 }
10692
Victor Stinnere1335c72011-10-04 20:53:03 +020010693 if (PyUnicode_READY(left))
10694 goto error;
10695 if (PyUnicode_READY(right))
10696 goto error;
10697
Victor Stinner488fa492011-12-12 00:01:39 +010010698 /* Shortcuts */
10699 if (left == unicode_empty) {
10700 Py_DECREF(left);
10701 Py_INCREF(right);
10702 *p_left = right;
10703 return;
10704 }
10705 if (right == unicode_empty)
10706 return;
10707
10708 left_len = PyUnicode_GET_LENGTH(left);
10709 right_len = PyUnicode_GET_LENGTH(right);
10710 if (left_len > PY_SSIZE_T_MAX - right_len) {
10711 PyErr_SetString(PyExc_OverflowError,
10712 "strings are too large to concat");
10713 goto error;
10714 }
10715 new_len = left_len + right_len;
10716
10717 if (unicode_modifiable(left)
10718 && PyUnicode_CheckExact(right)
10719 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010720 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10721 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010722 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010723 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010724 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10725 {
10726 /* append inplace */
10727 if (unicode_resize(p_left, new_len) != 0) {
10728 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10729 * deallocated so it cannot be put back into
10730 * 'variable'. The MemoryError is raised when there
10731 * is no value in 'variable', which might (very
10732 * remotely) be a cause of incompatibilities.
10733 */
10734 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010735 }
Victor Stinner488fa492011-12-12 00:01:39 +010010736 /* copy 'right' into the newly allocated area of 'left' */
10737 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010738 }
Victor Stinner488fa492011-12-12 00:01:39 +010010739 else {
10740 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10741 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10742 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010743
Victor Stinner488fa492011-12-12 00:01:39 +010010744 /* Concat the two Unicode strings */
10745 res = PyUnicode_New(new_len, maxchar);
10746 if (res == NULL)
10747 goto error;
10748 copy_characters(res, 0, left, 0, left_len);
10749 copy_characters(res, left_len, right, 0, right_len);
10750 Py_DECREF(left);
10751 *p_left = res;
10752 }
10753 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010754 return;
10755
10756error:
Victor Stinner488fa492011-12-12 00:01:39 +010010757 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010758}
10759
10760void
10761PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10762{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010763 PyUnicode_Append(pleft, right);
10764 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010765}
10766
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010767PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010768 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010770Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010771string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010772interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010773
10774static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010775unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010777 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010778 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010779 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010781 int kind1, kind2, kind;
10782 void *buf1, *buf2;
10783 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010784
Jesus Ceaac451502011-04-20 17:09:23 +020010785 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10786 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010787 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010789 kind1 = PyUnicode_KIND(self);
10790 kind2 = PyUnicode_KIND(substring);
10791 kind = kind1 > kind2 ? kind1 : kind2;
10792 buf1 = PyUnicode_DATA(self);
10793 buf2 = PyUnicode_DATA(substring);
10794 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010795 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 if (!buf1) {
10797 Py_DECREF(substring);
10798 return NULL;
10799 }
10800 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010801 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802 if (!buf2) {
10803 Py_DECREF(substring);
10804 if (kind1 != kind) PyMem_Free(buf1);
10805 return NULL;
10806 }
10807 len1 = PyUnicode_GET_LENGTH(self);
10808 len2 = PyUnicode_GET_LENGTH(substring);
10809
10810 ADJUST_INDICES(start, end, len1);
10811 switch(kind) {
10812 case PyUnicode_1BYTE_KIND:
10813 iresult = ucs1lib_count(
10814 ((Py_UCS1*)buf1) + start, end - start,
10815 buf2, len2, PY_SSIZE_T_MAX
10816 );
10817 break;
10818 case PyUnicode_2BYTE_KIND:
10819 iresult = ucs2lib_count(
10820 ((Py_UCS2*)buf1) + start, end - start,
10821 buf2, len2, PY_SSIZE_T_MAX
10822 );
10823 break;
10824 case PyUnicode_4BYTE_KIND:
10825 iresult = ucs4lib_count(
10826 ((Py_UCS4*)buf1) + start, end - start,
10827 buf2, len2, PY_SSIZE_T_MAX
10828 );
10829 break;
10830 default:
10831 assert(0); iresult = 0;
10832 }
10833
10834 result = PyLong_FromSsize_t(iresult);
10835
10836 if (kind1 != kind)
10837 PyMem_Free(buf1);
10838 if (kind2 != kind)
10839 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840
10841 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010842
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843 return result;
10844}
10845
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010846PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010847 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010848\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010849Encode S using the codec registered for encoding. Default encoding\n\
10850is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010851handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010852a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10853'xmlcharrefreplace' as well as any other name registered with\n\
10854codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855
10856static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010857unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010858{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010859 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010860 char *encoding = NULL;
10861 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010862
Benjamin Peterson308d6372009-09-18 21:42:35 +000010863 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10864 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010865 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010866 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010867}
10868
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010869PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010870 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871\n\
10872Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010873If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010874
10875static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010876unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010877{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010878 Py_ssize_t i, j, line_pos, src_len, incr;
10879 Py_UCS4 ch;
10880 PyObject *u;
10881 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010882 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010883 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010884 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010885
10886 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010887 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010888
Antoine Pitrou22425222011-10-04 19:10:51 +020010889 if (PyUnicode_READY(self) == -1)
10890 return NULL;
10891
Thomas Wouters7e474022000-07-16 12:04:32 +000010892 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010893 src_len = PyUnicode_GET_LENGTH(self);
10894 i = j = line_pos = 0;
10895 kind = PyUnicode_KIND(self);
10896 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010897 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010898 for (; i < src_len; i++) {
10899 ch = PyUnicode_READ(kind, src_data, i);
10900 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010901 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010902 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010903 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010904 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010905 goto overflow;
10906 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010907 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010908 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010910 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010911 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010912 goto overflow;
10913 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010914 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010915 if (ch == '\n' || ch == '\r')
10916 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010918 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010919 if (!found)
10920 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010921
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010923 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924 if (!u)
10925 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010926 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927
Antoine Pitroue71d5742011-10-04 15:55:09 +020010928 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929
Antoine Pitroue71d5742011-10-04 15:55:09 +020010930 for (; i < src_len; i++) {
10931 ch = PyUnicode_READ(kind, src_data, i);
10932 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010933 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010934 incr = tabsize - (line_pos % tabsize);
10935 line_pos += incr;
10936 while (incr--) {
10937 PyUnicode_WRITE(kind, dest_data, j, ' ');
10938 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010939 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010940 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010941 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010942 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010943 line_pos++;
10944 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010945 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010946 if (ch == '\n' || ch == '\r')
10947 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010949 }
10950 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010951 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010952
Antoine Pitroue71d5742011-10-04 15:55:09 +020010953 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010954 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10955 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956}
10957
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010958PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010959 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960\n\
10961Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010962such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963arguments start and end are interpreted as in slice notation.\n\
10964\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010965Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966
10967static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010970 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010971 Py_ssize_t start;
10972 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010973 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974
Jesus Ceaac451502011-04-20 17:09:23 +020010975 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10976 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010979 if (PyUnicode_READY(self) == -1)
10980 return NULL;
10981 if (PyUnicode_READY(substring) == -1)
10982 return NULL;
10983
Victor Stinner7931d9a2011-11-04 00:22:48 +010010984 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010985
10986 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 if (result == -2)
10989 return NULL;
10990
Christian Heimes217cfd12007-12-02 14:31:20 +000010991 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992}
10993
10994static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010995unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010997 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10998 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010999 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011000 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001}
11002
Guido van Rossumc2504932007-09-18 19:42:40 +000011003/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011004 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011005static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011006unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007{
Guido van Rossumc2504932007-09-18 19:42:40 +000011008 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011009 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011011 if (_PyUnicode_HASH(self) != -1)
11012 return _PyUnicode_HASH(self);
11013 if (PyUnicode_READY(self) == -1)
11014 return -1;
11015 len = PyUnicode_GET_LENGTH(self);
11016
11017 /* The hash function as a macro, gets expanded three times below. */
11018#define HASH(P) \
11019 x = (Py_uhash_t)*P << 7; \
11020 while (--len >= 0) \
11021 x = (1000003*x) ^ (Py_uhash_t)*P++;
11022
11023 switch (PyUnicode_KIND(self)) {
11024 case PyUnicode_1BYTE_KIND: {
11025 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11026 HASH(c);
11027 break;
11028 }
11029 case PyUnicode_2BYTE_KIND: {
11030 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11031 HASH(s);
11032 break;
11033 }
11034 default: {
11035 Py_UCS4 *l;
11036 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11037 "Impossible switch case in unicode_hash");
11038 l = PyUnicode_4BYTE_DATA(self);
11039 HASH(l);
11040 break;
11041 }
11042 }
11043 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11044
Guido van Rossumc2504932007-09-18 19:42:40 +000011045 if (x == -1)
11046 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011048 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011050#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011052PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011053 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011054\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011055Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056
11057static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011058unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011060 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011061 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011062 Py_ssize_t start;
11063 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064
Jesus Ceaac451502011-04-20 17:09:23 +020011065 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11066 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011069 if (PyUnicode_READY(self) == -1)
11070 return NULL;
11071 if (PyUnicode_READY(substring) == -1)
11072 return NULL;
11073
Victor Stinner7931d9a2011-11-04 00:22:48 +010011074 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075
11076 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011077
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011078 if (result == -2)
11079 return NULL;
11080
Guido van Rossumd57fd912000-03-10 22:53:23 +000011081 if (result < 0) {
11082 PyErr_SetString(PyExc_ValueError, "substring not found");
11083 return NULL;
11084 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011085
Christian Heimes217cfd12007-12-02 14:31:20 +000011086 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087}
11088
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011089PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011090 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011091\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011092Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011093at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094
11095static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011096unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011097{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011098 Py_ssize_t i, length;
11099 int kind;
11100 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011101 int cased;
11102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011103 if (PyUnicode_READY(self) == -1)
11104 return NULL;
11105 length = PyUnicode_GET_LENGTH(self);
11106 kind = PyUnicode_KIND(self);
11107 data = PyUnicode_DATA(self);
11108
Guido van Rossumd57fd912000-03-10 22:53:23 +000011109 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011110 if (length == 1)
11111 return PyBool_FromLong(
11112 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011113
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011114 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011115 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011116 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011117
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011119 for (i = 0; i < length; i++) {
11120 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011121
Benjamin Peterson29060642009-01-31 22:14:21 +000011122 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11123 return PyBool_FromLong(0);
11124 else if (!cased && Py_UNICODE_ISLOWER(ch))
11125 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011126 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011127 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011128}
11129
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011130PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011131 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011133Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011134at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135
11136static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011137unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011138{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011139 Py_ssize_t i, length;
11140 int kind;
11141 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142 int cased;
11143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144 if (PyUnicode_READY(self) == -1)
11145 return NULL;
11146 length = PyUnicode_GET_LENGTH(self);
11147 kind = PyUnicode_KIND(self);
11148 data = PyUnicode_DATA(self);
11149
Guido van Rossumd57fd912000-03-10 22:53:23 +000011150 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011151 if (length == 1)
11152 return PyBool_FromLong(
11153 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011155 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011156 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011157 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011158
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011160 for (i = 0; i < length; i++) {
11161 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011162
Benjamin Peterson29060642009-01-31 22:14:21 +000011163 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11164 return PyBool_FromLong(0);
11165 else if (!cased && Py_UNICODE_ISUPPER(ch))
11166 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011168 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169}
11170
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011171PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011172 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011174Return True if S is a titlecased string and there is at least one\n\
11175character in S, i.e. upper- and titlecase characters may only\n\
11176follow uncased characters and lowercase characters only cased ones.\n\
11177Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011178
11179static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011180unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011181{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011182 Py_ssize_t i, length;
11183 int kind;
11184 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185 int cased, previous_is_cased;
11186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011187 if (PyUnicode_READY(self) == -1)
11188 return NULL;
11189 length = PyUnicode_GET_LENGTH(self);
11190 kind = PyUnicode_KIND(self);
11191 data = PyUnicode_DATA(self);
11192
Guido van Rossumd57fd912000-03-10 22:53:23 +000011193 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011194 if (length == 1) {
11195 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11196 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11197 (Py_UNICODE_ISUPPER(ch) != 0));
11198 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011200 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011201 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011202 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011203
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204 cased = 0;
11205 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011206 for (i = 0; i < length; i++) {
11207 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011208
Benjamin Peterson29060642009-01-31 22:14:21 +000011209 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11210 if (previous_is_cased)
11211 return PyBool_FromLong(0);
11212 previous_is_cased = 1;
11213 cased = 1;
11214 }
11215 else if (Py_UNICODE_ISLOWER(ch)) {
11216 if (!previous_is_cased)
11217 return PyBool_FromLong(0);
11218 previous_is_cased = 1;
11219 cased = 1;
11220 }
11221 else
11222 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011224 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225}
11226
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011227PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011228 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011230Return True if all characters in S are whitespace\n\
11231and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232
11233static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011234unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011236 Py_ssize_t i, length;
11237 int kind;
11238 void *data;
11239
11240 if (PyUnicode_READY(self) == -1)
11241 return NULL;
11242 length = PyUnicode_GET_LENGTH(self);
11243 kind = PyUnicode_KIND(self);
11244 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245
Guido van Rossumd57fd912000-03-10 22:53:23 +000011246 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011247 if (length == 1)
11248 return PyBool_FromLong(
11249 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011251 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011252 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011253 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011255 for (i = 0; i < length; i++) {
11256 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011257 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011258 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011260 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261}
11262
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011263PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011264 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011265\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011266Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011267and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011268
11269static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011270unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011271{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011272 Py_ssize_t i, length;
11273 int kind;
11274 void *data;
11275
11276 if (PyUnicode_READY(self) == -1)
11277 return NULL;
11278 length = PyUnicode_GET_LENGTH(self);
11279 kind = PyUnicode_KIND(self);
11280 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011281
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011282 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011283 if (length == 1)
11284 return PyBool_FromLong(
11285 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011286
11287 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011288 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011289 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011291 for (i = 0; i < length; i++) {
11292 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011293 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011294 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011295 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011296}
11297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011298PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011299 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011300\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011301Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011302and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011303
11304static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011305unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011306{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011307 int kind;
11308 void *data;
11309 Py_ssize_t len, i;
11310
11311 if (PyUnicode_READY(self) == -1)
11312 return NULL;
11313
11314 kind = PyUnicode_KIND(self);
11315 data = PyUnicode_DATA(self);
11316 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011317
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011318 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319 if (len == 1) {
11320 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11321 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11322 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011323
11324 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011325 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011326 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011327
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011328 for (i = 0; i < len; i++) {
11329 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011330 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011331 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011332 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011333 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011334}
11335
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011336PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011337 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011339Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011340False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341
11342static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011343unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011345 Py_ssize_t i, length;
11346 int kind;
11347 void *data;
11348
11349 if (PyUnicode_READY(self) == -1)
11350 return NULL;
11351 length = PyUnicode_GET_LENGTH(self);
11352 kind = PyUnicode_KIND(self);
11353 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354
Guido van Rossumd57fd912000-03-10 22:53:23 +000011355 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011356 if (length == 1)
11357 return PyBool_FromLong(
11358 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011360 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011362 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011363
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011364 for (i = 0; i < length; i++) {
11365 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011366 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011368 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369}
11370
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011371PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011372 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011374Return True if all characters in S are digits\n\
11375and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376
11377static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011378unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011379{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011380 Py_ssize_t i, length;
11381 int kind;
11382 void *data;
11383
11384 if (PyUnicode_READY(self) == -1)
11385 return NULL;
11386 length = PyUnicode_GET_LENGTH(self);
11387 kind = PyUnicode_KIND(self);
11388 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011391 if (length == 1) {
11392 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11393 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11394 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011395
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011396 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011398 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011399
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 for (i = 0; i < length; i++) {
11401 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011402 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011404 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405}
11406
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011407PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011408 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011410Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011411False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412
11413static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011414unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011416 Py_ssize_t i, length;
11417 int kind;
11418 void *data;
11419
11420 if (PyUnicode_READY(self) == -1)
11421 return NULL;
11422 length = PyUnicode_GET_LENGTH(self);
11423 kind = PyUnicode_KIND(self);
11424 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011427 if (length == 1)
11428 return PyBool_FromLong(
11429 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011431 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011432 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011433 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011434
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011435 for (i = 0; i < length; i++) {
11436 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011437 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011439 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440}
11441
Martin v. Löwis47383402007-08-15 07:32:56 +000011442int
11443PyUnicode_IsIdentifier(PyObject *self)
11444{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 int kind;
11446 void *data;
11447 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011448 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 if (PyUnicode_READY(self) == -1) {
11451 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011452 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 }
11454
11455 /* Special case for empty strings */
11456 if (PyUnicode_GET_LENGTH(self) == 0)
11457 return 0;
11458 kind = PyUnicode_KIND(self);
11459 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011460
11461 /* PEP 3131 says that the first character must be in
11462 XID_Start and subsequent characters in XID_Continue,
11463 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011464 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011465 letters, digits, underscore). However, given the current
11466 definition of XID_Start and XID_Continue, it is sufficient
11467 to check just for these, except that _ must be allowed
11468 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011470 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011471 return 0;
11472
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011473 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011474 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011475 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011476 return 1;
11477}
11478
11479PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011480 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011481\n\
11482Return True if S is a valid identifier according\n\
11483to the language definition.");
11484
11485static PyObject*
11486unicode_isidentifier(PyObject *self)
11487{
11488 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11489}
11490
Georg Brandl559e5d72008-06-11 18:37:52 +000011491PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011492 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011493\n\
11494Return True if all characters in S are considered\n\
11495printable in repr() or S is empty, False otherwise.");
11496
11497static PyObject*
11498unicode_isprintable(PyObject *self)
11499{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011500 Py_ssize_t i, length;
11501 int kind;
11502 void *data;
11503
11504 if (PyUnicode_READY(self) == -1)
11505 return NULL;
11506 length = PyUnicode_GET_LENGTH(self);
11507 kind = PyUnicode_KIND(self);
11508 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011509
11510 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011511 if (length == 1)
11512 return PyBool_FromLong(
11513 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011514
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011515 for (i = 0; i < length; i++) {
11516 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011517 Py_RETURN_FALSE;
11518 }
11519 }
11520 Py_RETURN_TRUE;
11521}
11522
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011523PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011524 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011525\n\
11526Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011527iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528
11529static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011530unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011532 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533}
11534
Martin v. Löwis18e16552006-02-15 17:27:45 +000011535static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011536unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538 if (PyUnicode_READY(self) == -1)
11539 return -1;
11540 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541}
11542
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011543PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011544 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011546Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011547done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548
11549static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011550unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011552 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011553 Py_UCS4 fillchar = ' ';
11554
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011555 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556 return NULL;
11557
Victor Stinnerc4b49542011-12-11 22:44:26 +010011558 if (PyUnicode_READY(self) < 0)
11559 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560
Victor Stinnerc4b49542011-12-11 22:44:26 +010011561 if (PyUnicode_GET_LENGTH(self) >= width)
11562 return unicode_result_unchanged(self);
11563
11564 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565}
11566
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011567PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011568 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011570Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571
11572static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011573unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575 return fixup(self, fixlower);
11576}
11577
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip mode above. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for a strip mode: the format string with its leading
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11586
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011587/* externally visible for str.strip(unicode) */
11588PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011589_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011590{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011591 void *data;
11592 int kind;
11593 Py_ssize_t i, j, len;
11594 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011596 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11597 return NULL;
11598
11599 kind = PyUnicode_KIND(self);
11600 data = PyUnicode_DATA(self);
11601 len = PyUnicode_GET_LENGTH(self);
11602 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11603 PyUnicode_DATA(sepobj),
11604 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011605
Benjamin Peterson14339b62009-01-31 16:36:08 +000011606 i = 0;
11607 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011608 while (i < len &&
11609 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011610 i++;
11611 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011612 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011613
Benjamin Peterson14339b62009-01-31 16:36:08 +000011614 j = len;
11615 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011616 do {
11617 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011618 } while (j >= i &&
11619 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011620 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011621 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011622
Victor Stinner7931d9a2011-11-04 00:22:48 +010011623 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011624}
11625
11626PyObject*
11627PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11628{
11629 unsigned char *data;
11630 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011631 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632
Victor Stinnerde636f32011-10-01 03:55:54 +020011633 if (PyUnicode_READY(self) == -1)
11634 return NULL;
11635
11636 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11637
Victor Stinner12bab6d2011-10-01 01:53:49 +020011638 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010011639 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640
Victor Stinner12bab6d2011-10-01 01:53:49 +020011641 length = end - start;
11642 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011643 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011644
Victor Stinnerde636f32011-10-01 03:55:54 +020011645 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011646 PyErr_SetString(PyExc_IndexError, "string index out of range");
11647 return NULL;
11648 }
11649
Victor Stinnerb9275c12011-10-05 14:01:42 +020011650 if (PyUnicode_IS_ASCII(self)) {
11651 kind = PyUnicode_KIND(self);
11652 data = PyUnicode_1BYTE_DATA(self);
11653 return unicode_fromascii(data + start, length);
11654 }
11655 else {
11656 kind = PyUnicode_KIND(self);
11657 data = PyUnicode_1BYTE_DATA(self);
11658 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011659 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011660 length);
11661 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663
11664static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011665do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011667 int kind;
11668 void *data;
11669 Py_ssize_t len, i, j;
11670
11671 if (PyUnicode_READY(self) == -1)
11672 return NULL;
11673
11674 kind = PyUnicode_KIND(self);
11675 data = PyUnicode_DATA(self);
11676 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011677
Benjamin Peterson14339b62009-01-31 16:36:08 +000011678 i = 0;
11679 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011681 i++;
11682 }
11683 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011684
Benjamin Peterson14339b62009-01-31 16:36:08 +000011685 j = len;
11686 if (striptype != LEFTSTRIP) {
11687 do {
11688 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011689 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011690 j++;
11691 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011692
Victor Stinner7931d9a2011-11-04 00:22:48 +010011693 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694}
11695
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011696
11697static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011698do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011699{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011700 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011701
Benjamin Peterson14339b62009-01-31 16:36:08 +000011702 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11703 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011704
Benjamin Peterson14339b62009-01-31 16:36:08 +000011705 if (sep != NULL && sep != Py_None) {
11706 if (PyUnicode_Check(sep))
11707 return _PyUnicode_XStrip(self, striptype, sep);
11708 else {
11709 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011710 "%s arg must be None or str",
11711 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011712 return NULL;
11713 }
11714 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011715
Benjamin Peterson14339b62009-01-31 16:36:08 +000011716 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011717}
11718
11719
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011720PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011721 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011722\n\
11723Return a copy of the string S with leading and trailing\n\
11724whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011725If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011726
11727static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011728unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011729{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011730 if (PyTuple_GET_SIZE(args) == 0)
11731 return do_strip(self, BOTHSTRIP); /* Common case */
11732 else
11733 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011734}
11735
11736
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011737PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011738 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011739\n\
11740Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011741If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011742
11743static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011744unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011745{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011746 if (PyTuple_GET_SIZE(args) == 0)
11747 return do_strip(self, LEFTSTRIP); /* Common case */
11748 else
11749 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011750}
11751
11752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011753PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011754 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011755\n\
11756Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011757If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011758
11759static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011760unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011761{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011762 if (PyTuple_GET_SIZE(args) == 0)
11763 return do_strip(self, RIGHTSTRIP); /* Common case */
11764 else
11765 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011766}
11767
11768
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011770unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011772 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011773 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774
Georg Brandl222de0f2009-04-12 12:01:50 +000011775 if (len < 1) {
11776 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011777 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779
Victor Stinnerc4b49542011-12-11 22:44:26 +010011780 /* no repeat, return original string */
11781 if (len == 1)
11782 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011783
Victor Stinnerc4b49542011-12-11 22:44:26 +010011784 if (PyUnicode_READY(str) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 return NULL;
11786
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011787 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011788 PyErr_SetString(PyExc_OverflowError,
11789 "repeated string is too long");
11790 return NULL;
11791 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011793
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011794 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795 if (!u)
11796 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011797 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 if (PyUnicode_GET_LENGTH(str) == 1) {
11800 const int kind = PyUnicode_KIND(str);
11801 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11802 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011803 if (kind == PyUnicode_1BYTE_KIND)
11804 memset(to, (unsigned char)fill_char, len);
11805 else {
11806 for (n = 0; n < len; ++n)
11807 PyUnicode_WRITE(kind, to, n, fill_char);
11808 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 }
11810 else {
11811 /* number of characters copied this far */
11812 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011813 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011814 char *to = (char *) PyUnicode_DATA(u);
11815 Py_MEMCPY(to, PyUnicode_DATA(str),
11816 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011817 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 n = (done <= nchars-done) ? done : nchars-done;
11819 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011820 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011821 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822 }
11823
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011824 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011825 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826}
11827
Alexander Belopolsky40018472011-02-26 01:02:56 +000011828PyObject *
11829PyUnicode_Replace(PyObject *obj,
11830 PyObject *subobj,
11831 PyObject *replobj,
11832 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011833{
11834 PyObject *self;
11835 PyObject *str1;
11836 PyObject *str2;
11837 PyObject *result;
11838
11839 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011840 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011841 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011843 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011844 Py_DECREF(self);
11845 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846 }
11847 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011848 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011849 Py_DECREF(self);
11850 Py_DECREF(str1);
11851 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011853 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854 Py_DECREF(self);
11855 Py_DECREF(str1);
11856 Py_DECREF(str2);
11857 return result;
11858}
11859
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011860PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011861 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011862\n\
11863Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011864old replaced by new. If the optional argument count is\n\
11865given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866
11867static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011868unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011870 PyObject *str1;
11871 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011872 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873 PyObject *result;
11874
Martin v. Löwis18e16552006-02-15 17:27:45 +000011875 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011877 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011878 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011879 str1 = PyUnicode_FromObject(str1);
11880 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11881 return NULL;
11882 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011883 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011884 Py_DECREF(str1);
11885 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011886 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887
11888 result = replace(self, str1, str2, maxcount);
11889
11890 Py_DECREF(str1);
11891 Py_DECREF(str2);
11892 return result;
11893}
11894
Alexander Belopolsky40018472011-02-26 01:02:56 +000011895static PyObject *
11896unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011898 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 Py_ssize_t isize;
11900 Py_ssize_t osize, squote, dquote, i, o;
11901 Py_UCS4 max, quote;
11902 int ikind, okind;
11903 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011905 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011906 return NULL;
11907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 isize = PyUnicode_GET_LENGTH(unicode);
11909 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 /* Compute length of output, quote characters, and
11912 maximum character */
11913 osize = 2; /* quotes */
11914 max = 127;
11915 squote = dquote = 0;
11916 ikind = PyUnicode_KIND(unicode);
11917 for (i = 0; i < isize; i++) {
11918 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11919 switch (ch) {
11920 case '\'': squote++; osize++; break;
11921 case '"': dquote++; osize++; break;
11922 case '\\': case '\t': case '\r': case '\n':
11923 osize += 2; break;
11924 default:
11925 /* Fast-path ASCII */
11926 if (ch < ' ' || ch == 0x7f)
11927 osize += 4; /* \xHH */
11928 else if (ch < 0x7f)
11929 osize++;
11930 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11931 osize++;
11932 max = ch > max ? ch : max;
11933 }
11934 else if (ch < 0x100)
11935 osize += 4; /* \xHH */
11936 else if (ch < 0x10000)
11937 osize += 6; /* \uHHHH */
11938 else
11939 osize += 10; /* \uHHHHHHHH */
11940 }
11941 }
11942
11943 quote = '\'';
11944 if (squote) {
11945 if (dquote)
11946 /* Both squote and dquote present. Use squote,
11947 and escape them */
11948 osize += squote;
11949 else
11950 quote = '"';
11951 }
11952
11953 repr = PyUnicode_New(osize, max);
11954 if (repr == NULL)
11955 return NULL;
11956 okind = PyUnicode_KIND(repr);
11957 odata = PyUnicode_DATA(repr);
11958
11959 PyUnicode_WRITE(okind, odata, 0, quote);
11960 PyUnicode_WRITE(okind, odata, osize-1, quote);
11961
11962 for (i = 0, o = 1; i < isize; i++) {
11963 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011964
11965 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 if ((ch == quote) || (ch == '\\')) {
11967 PyUnicode_WRITE(okind, odata, o++, '\\');
11968 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011969 continue;
11970 }
11971
Benjamin Peterson29060642009-01-31 22:14:21 +000011972 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011973 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 PyUnicode_WRITE(okind, odata, o++, '\\');
11975 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011976 }
11977 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 PyUnicode_WRITE(okind, odata, o++, '\\');
11979 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011980 }
11981 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 PyUnicode_WRITE(okind, odata, o++, '\\');
11983 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011984 }
11985
11986 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011987 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 PyUnicode_WRITE(okind, odata, o++, '\\');
11989 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011990 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11991 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011992 }
11993
Georg Brandl559e5d72008-06-11 18:37:52 +000011994 /* Copy ASCII characters as-is */
11995 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011997 }
11998
Benjamin Peterson29060642009-01-31 22:14:21 +000011999 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012000 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012001 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012002 (categories Z* and C* except ASCII space)
12003 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012005 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006 if (ch <= 0xff) {
12007 PyUnicode_WRITE(okind, odata, o++, '\\');
12008 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012009 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12010 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012011 }
12012 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 else if (ch >= 0x10000) {
12014 PyUnicode_WRITE(okind, odata, o++, '\\');
12015 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012016 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12017 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12018 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12019 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12020 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12021 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12022 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12023 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012024 }
12025 /* Map 16-bit characters to '\uxxxx' */
12026 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012027 PyUnicode_WRITE(okind, odata, o++, '\\');
12028 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012029 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12030 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12031 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12032 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012033 }
12034 }
12035 /* Copy characters as-is */
12036 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012037 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012038 }
12039 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012040 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012042 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012043 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012044}
12045
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012046PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012047 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048\n\
12049Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012050such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051arguments start and end are interpreted as in slice notation.\n\
12052\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012053Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054
12055static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012058 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012059 Py_ssize_t start;
12060 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012061 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062
Jesus Ceaac451502011-04-20 17:09:23 +020012063 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12064 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012065 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012067 if (PyUnicode_READY(self) == -1)
12068 return NULL;
12069 if (PyUnicode_READY(substring) == -1)
12070 return NULL;
12071
Victor Stinner7931d9a2011-11-04 00:22:48 +010012072 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073
12074 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 if (result == -2)
12077 return NULL;
12078
Christian Heimes217cfd12007-12-02 14:31:20 +000012079 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080}
12081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012082PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012083 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012085Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086
12087static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012088unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012090 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012091 Py_ssize_t start;
12092 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012093 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094
Jesus Ceaac451502011-04-20 17:09:23 +020012095 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12096 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012097 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099 if (PyUnicode_READY(self) == -1)
12100 return NULL;
12101 if (PyUnicode_READY(substring) == -1)
12102 return NULL;
12103
Victor Stinner7931d9a2011-11-04 00:22:48 +010012104 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105
12106 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012107
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 if (result == -2)
12109 return NULL;
12110
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111 if (result < 0) {
12112 PyErr_SetString(PyExc_ValueError, "substring not found");
12113 return NULL;
12114 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115
Christian Heimes217cfd12007-12-02 14:31:20 +000012116 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117}
12118
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012119PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012120 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012122Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012123done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124
12125static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012126unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012128 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 Py_UCS4 fillchar = ' ';
12130
Victor Stinnere9a29352011-10-01 02:14:59 +020012131 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012133
Victor Stinnerc4b49542011-12-11 22:44:26 +010012134 if (PyUnicode_READY(self) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135 return NULL;
12136
Victor Stinnerc4b49542011-12-11 22:44:26 +010012137 if (PyUnicode_GET_LENGTH(self) >= width)
12138 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012139
Victor Stinnerc4b49542011-12-11 22:44:26 +010012140 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012141}
12142
Alexander Belopolsky40018472011-02-26 01:02:56 +000012143PyObject *
12144PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012145{
12146 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012147
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148 s = PyUnicode_FromObject(s);
12149 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012150 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012151 if (sep != NULL) {
12152 sep = PyUnicode_FromObject(sep);
12153 if (sep == NULL) {
12154 Py_DECREF(s);
12155 return NULL;
12156 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157 }
12158
Victor Stinner9310abb2011-10-05 00:59:23 +020012159 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160
12161 Py_DECREF(s);
12162 Py_XDECREF(sep);
12163 return result;
12164}
12165
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012166PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012167 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012168\n\
12169Return a list of the words in S, using sep as the\n\
12170delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012171splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012172whitespace string is a separator and empty strings are\n\
12173removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174
12175static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012176unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177{
12178 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012179 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012180
Martin v. Löwis18e16552006-02-15 17:27:45 +000012181 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182 return NULL;
12183
12184 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012185 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012187 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012188 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012189 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012190}
12191
Thomas Wouters477c8d52006-05-27 19:21:47 +000012192PyObject *
12193PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12194{
12195 PyObject* str_obj;
12196 PyObject* sep_obj;
12197 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012198 int kind1, kind2, kind;
12199 void *buf1 = NULL, *buf2 = NULL;
12200 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012201
12202 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012203 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012204 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012205 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012206 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012207 Py_DECREF(str_obj);
12208 return NULL;
12209 }
12210
Victor Stinner14f8f022011-10-05 20:58:25 +020012211 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012213 kind = Py_MAX(kind1, kind2);
12214 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012216 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012217 if (!buf1)
12218 goto onError;
12219 buf2 = PyUnicode_DATA(sep_obj);
12220 if (kind2 != kind)
12221 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12222 if (!buf2)
12223 goto onError;
12224 len1 = PyUnicode_GET_LENGTH(str_obj);
12225 len2 = PyUnicode_GET_LENGTH(sep_obj);
12226
Victor Stinner14f8f022011-10-05 20:58:25 +020012227 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012228 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012229 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12230 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12231 else
12232 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012233 break;
12234 case PyUnicode_2BYTE_KIND:
12235 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12236 break;
12237 case PyUnicode_4BYTE_KIND:
12238 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12239 break;
12240 default:
12241 assert(0);
12242 out = 0;
12243 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012244
12245 Py_DECREF(sep_obj);
12246 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247 if (kind1 != kind)
12248 PyMem_Free(buf1);
12249 if (kind2 != kind)
12250 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012251
12252 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012253 onError:
12254 Py_DECREF(sep_obj);
12255 Py_DECREF(str_obj);
12256 if (kind1 != kind && buf1)
12257 PyMem_Free(buf1);
12258 if (kind2 != kind && buf2)
12259 PyMem_Free(buf2);
12260 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012261}
12262
12263
12264PyObject *
12265PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12266{
12267 PyObject* str_obj;
12268 PyObject* sep_obj;
12269 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012270 int kind1, kind2, kind;
12271 void *buf1 = NULL, *buf2 = NULL;
12272 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012273
12274 str_obj = PyUnicode_FromObject(str_in);
12275 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012276 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012277 sep_obj = PyUnicode_FromObject(sep_in);
12278 if (!sep_obj) {
12279 Py_DECREF(str_obj);
12280 return NULL;
12281 }
12282
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012283 kind1 = PyUnicode_KIND(str_in);
12284 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012285 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286 buf1 = PyUnicode_DATA(str_in);
12287 if (kind1 != kind)
12288 buf1 = _PyUnicode_AsKind(str_in, kind);
12289 if (!buf1)
12290 goto onError;
12291 buf2 = PyUnicode_DATA(sep_obj);
12292 if (kind2 != kind)
12293 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12294 if (!buf2)
12295 goto onError;
12296 len1 = PyUnicode_GET_LENGTH(str_obj);
12297 len2 = PyUnicode_GET_LENGTH(sep_obj);
12298
12299 switch(PyUnicode_KIND(str_in)) {
12300 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012301 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12302 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12303 else
12304 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012305 break;
12306 case PyUnicode_2BYTE_KIND:
12307 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12308 break;
12309 case PyUnicode_4BYTE_KIND:
12310 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12311 break;
12312 default:
12313 assert(0);
12314 out = 0;
12315 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012316
12317 Py_DECREF(sep_obj);
12318 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 if (kind1 != kind)
12320 PyMem_Free(buf1);
12321 if (kind2 != kind)
12322 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012323
12324 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325 onError:
12326 Py_DECREF(sep_obj);
12327 Py_DECREF(str_obj);
12328 if (kind1 != kind && buf1)
12329 PyMem_Free(buf1);
12330 if (kind2 != kind && buf2)
12331 PyMem_Free(buf2);
12332 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012333}
12334
12335PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012336 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012337\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012338Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012339the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012340found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012341
12342static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012343unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012344{
Victor Stinner9310abb2011-10-05 00:59:23 +020012345 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012346}
12347
12348PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012349 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012350\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012351Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012352the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012353separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012354
12355static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012356unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012357{
Victor Stinner9310abb2011-10-05 00:59:23 +020012358 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012359}
12360
Alexander Belopolsky40018472011-02-26 01:02:56 +000012361PyObject *
12362PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012363{
12364 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012365
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012366 s = PyUnicode_FromObject(s);
12367 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012368 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012369 if (sep != NULL) {
12370 sep = PyUnicode_FromObject(sep);
12371 if (sep == NULL) {
12372 Py_DECREF(s);
12373 return NULL;
12374 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012375 }
12376
Victor Stinner9310abb2011-10-05 00:59:23 +020012377 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012378
12379 Py_DECREF(s);
12380 Py_XDECREF(sep);
12381 return result;
12382}
12383
12384PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012385 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012386\n\
12387Return a list of the words in S, using sep as the\n\
12388delimiter string, starting at the end of the string and\n\
12389working to the front. If maxsplit is given, at most maxsplit\n\
12390splits are done. If sep is not specified, any whitespace string\n\
12391is a separator.");
12392
12393static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012394unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012395{
12396 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012397 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012398
Martin v. Löwis18e16552006-02-15 17:27:45 +000012399 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012400 return NULL;
12401
12402 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012403 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012404 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012405 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012406 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012407 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012408}
12409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012410PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012411 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012412\n\
12413Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012414Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012415is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012416
12417static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012418unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012419{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012420 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012421 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012422
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012423 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12424 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012425 return NULL;
12426
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012427 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012428}
12429
12430static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012431PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012432{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012433 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012434}
12435
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012436PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012437 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012438\n\
12439Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012440and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012441
12442static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012443unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012444{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012445 return fixup(self, fixswapcase);
12446}
12447
Georg Brandlceee0772007-11-27 23:48:05 +000012448PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012449 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012450\n\
12451Return a translation table usable for str.translate().\n\
12452If there is only one argument, it must be a dictionary mapping Unicode\n\
12453ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012454Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012455If there are two arguments, they must be strings of equal length, and\n\
12456in the resulting dictionary, each character in x will be mapped to the\n\
12457character at the same position in y. If there is a third argument, it\n\
12458must be a string, whose characters will be mapped to None in the result.");
12459
12460static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012461unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012462{
12463 PyObject *x, *y = NULL, *z = NULL;
12464 PyObject *new = NULL, *key, *value;
12465 Py_ssize_t i = 0;
12466 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012467
Georg Brandlceee0772007-11-27 23:48:05 +000012468 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12469 return NULL;
12470 new = PyDict_New();
12471 if (!new)
12472 return NULL;
12473 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012474 int x_kind, y_kind, z_kind;
12475 void *x_data, *y_data, *z_data;
12476
Georg Brandlceee0772007-11-27 23:48:05 +000012477 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012478 if (!PyUnicode_Check(x)) {
12479 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12480 "be a string if there is a second argument");
12481 goto err;
12482 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012483 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012484 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12485 "arguments must have equal length");
12486 goto err;
12487 }
12488 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012489 x_kind = PyUnicode_KIND(x);
12490 y_kind = PyUnicode_KIND(y);
12491 x_data = PyUnicode_DATA(x);
12492 y_data = PyUnicode_DATA(y);
12493 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12494 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12495 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012496 if (!key || !value)
12497 goto err;
12498 res = PyDict_SetItem(new, key, value);
12499 Py_DECREF(key);
12500 Py_DECREF(value);
12501 if (res < 0)
12502 goto err;
12503 }
12504 /* create entries for deleting chars in z */
12505 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012506 z_kind = PyUnicode_KIND(z);
12507 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012508 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012510 if (!key)
12511 goto err;
12512 res = PyDict_SetItem(new, key, Py_None);
12513 Py_DECREF(key);
12514 if (res < 0)
12515 goto err;
12516 }
12517 }
12518 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012519 int kind;
12520 void *data;
12521
Georg Brandlceee0772007-11-27 23:48:05 +000012522 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012523 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012524 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12525 "to maketrans it must be a dict");
12526 goto err;
12527 }
12528 /* copy entries into the new dict, converting string keys to int keys */
12529 while (PyDict_Next(x, &i, &key, &value)) {
12530 if (PyUnicode_Check(key)) {
12531 /* convert string keys to integer keys */
12532 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012533 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012534 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12535 "table must be of length 1");
12536 goto err;
12537 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012538 kind = PyUnicode_KIND(key);
12539 data = PyUnicode_DATA(key);
12540 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012541 if (!newkey)
12542 goto err;
12543 res = PyDict_SetItem(new, newkey, value);
12544 Py_DECREF(newkey);
12545 if (res < 0)
12546 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012547 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012548 /* just keep integer keys */
12549 if (PyDict_SetItem(new, key, value) < 0)
12550 goto err;
12551 } else {
12552 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12553 "be strings or integers");
12554 goto err;
12555 }
12556 }
12557 }
12558 return new;
12559 err:
12560 Py_DECREF(new);
12561 return NULL;
12562}
12563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012564PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012565 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566\n\
12567Return a copy of the string S, where all characters have been mapped\n\
12568through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012569Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012570Unmapped characters are left untouched. Characters mapped to None\n\
12571are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572
12573static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012576 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577}
12578
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012579PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012580 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012582Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583
12584static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012585unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012586{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012587 return fixup(self, fixupper);
12588}
12589
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012590PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012591 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012592\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012593Pad a numeric string S with zeros on the left, to fill a field\n\
12594of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012595
12596static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012597unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012598{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012599 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012600 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012601 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012602 int kind;
12603 void *data;
12604 Py_UCS4 chr;
12605
Martin v. Löwis18e16552006-02-15 17:27:45 +000012606 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012607 return NULL;
12608
Victor Stinnerc4b49542011-12-11 22:44:26 +010012609 if (PyUnicode_READY(self) < 0)
12610 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012611
Victor Stinnerc4b49542011-12-11 22:44:26 +010012612 if (PyUnicode_GET_LENGTH(self) >= width)
12613 return unicode_result_unchanged(self);
12614
12615 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012616
12617 u = pad(self, fill, 0, '0');
12618
Walter Dörwald068325e2002-04-15 13:36:47 +000012619 if (u == NULL)
12620 return NULL;
12621
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012622 kind = PyUnicode_KIND(u);
12623 data = PyUnicode_DATA(u);
12624 chr = PyUnicode_READ(kind, data, fill);
12625
12626 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012628 PyUnicode_WRITE(kind, data, 0, chr);
12629 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012630 }
12631
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012632 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012633 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012634}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635
#if 0
/* Compiled out.  Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() applied to self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012644PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012645 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012646\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012647Return True if S starts with the specified prefix, False otherwise.\n\
12648With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012649With optional end, stop comparing S at that position.\n\
12650prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012651
12652static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012653unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012654 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012656 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012657 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012658 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012659 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012660 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661
Jesus Ceaac451502011-04-20 17:09:23 +020012662 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012663 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012664 if (PyTuple_Check(subobj)) {
12665 Py_ssize_t i;
12666 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012667 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012668 if (substring == NULL)
12669 return NULL;
12670 result = tailmatch(self, substring, start, end, -1);
12671 Py_DECREF(substring);
12672 if (result) {
12673 Py_RETURN_TRUE;
12674 }
12675 }
12676 /* nothing matched */
12677 Py_RETURN_FALSE;
12678 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012679 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012680 if (substring == NULL) {
12681 if (PyErr_ExceptionMatches(PyExc_TypeError))
12682 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12683 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012684 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012685 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012686 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012688 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689}
12690
12691
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012692PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012693 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012695Return True if S ends with the specified suffix, False otherwise.\n\
12696With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012697With optional end, stop comparing S at that position.\n\
12698suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699
12700static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012701unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012702 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012703{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012704 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012705 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012706 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012707 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012708 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012709
Jesus Ceaac451502011-04-20 17:09:23 +020012710 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012711 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012712 if (PyTuple_Check(subobj)) {
12713 Py_ssize_t i;
12714 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012715 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012716 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012717 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012718 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012719 result = tailmatch(self, substring, start, end, +1);
12720 Py_DECREF(substring);
12721 if (result) {
12722 Py_RETURN_TRUE;
12723 }
12724 }
12725 Py_RETURN_FALSE;
12726 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012727 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012728 if (substring == NULL) {
12729 if (PyErr_ExceptionMatches(PyExc_TypeError))
12730 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12731 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012732 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012733 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012734 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012735 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012736 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012737}
12738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012740
12741PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012742 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012743\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012744Return a formatted version of S, using substitutions from args and kwargs.\n\
12745The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012746
Eric Smith27bbca62010-11-04 17:06:58 +000012747PyDoc_STRVAR(format_map__doc__,
12748 "S.format_map(mapping) -> str\n\
12749\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012750Return a formatted version of S, using substitutions from mapping.\n\
12751The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012752
Eric Smith4a7d76d2008-05-30 18:10:19 +000012753static PyObject *
12754unicode__format__(PyObject* self, PyObject* args)
12755{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012756 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012757
12758 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12759 return NULL;
12760
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012761 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012762 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012763 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012764}
12765
Eric Smith8c663262007-08-25 02:26:07 +000012766PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012767 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012768\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012769Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012770
12771static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012772unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012773{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012774 Py_ssize_t size;
12775
12776 /* If it's a compact object, account for base structure +
12777 character data. */
12778 if (PyUnicode_IS_COMPACT_ASCII(v))
12779 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12780 else if (PyUnicode_IS_COMPACT(v))
12781 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012782 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012783 else {
12784 /* If it is a two-block object, account for base object, and
12785 for character block if present. */
12786 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012787 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012788 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012789 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012790 }
12791 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012792 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012793 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012794 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012795 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012796 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012797
12798 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012799}
12800
12801PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012802 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012803
12804static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012805unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012806{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010012807 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012808 if (!copy)
12809 return NULL;
12810 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012811}
12812
Guido van Rossumd57fd912000-03-10 22:53:23 +000012813static PyMethodDef unicode_methods[] = {
12814
12815 /* Order is according to common usage: often used methods should
12816 appear first, since lookup is done sequentially. */
12817
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012818 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012819 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12820 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012821 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012822 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12823 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12824 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12825 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12826 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12827 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12828 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012829 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012830 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12831 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12832 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012833 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012834 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12835 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12836 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012837 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012838 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012839 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012840 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012841 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12842 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12843 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12844 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12845 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12846 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12847 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12848 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12849 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12850 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12851 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12852 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12853 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12854 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012855 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012856 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012857 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012858 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012859 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012860 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012861 {"maketrans", (PyCFunction) unicode_maketrans,
12862 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012863 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012864#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012865 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012866#endif
12867
12868#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012869 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012870 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012871#endif
12872
Benjamin Peterson14339b62009-01-31 16:36:08 +000012873 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874 {NULL, NULL}
12875};
12876
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012877static PyObject *
12878unicode_mod(PyObject *v, PyObject *w)
12879{
Brian Curtindfc80e32011-08-10 20:28:54 -050012880 if (!PyUnicode_Check(v))
12881 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012882 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012883}
12884
12885static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012886 0, /*nb_add*/
12887 0, /*nb_subtract*/
12888 0, /*nb_multiply*/
12889 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012890};
12891
Guido van Rossumd57fd912000-03-10 22:53:23 +000012892static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012893 (lenfunc) unicode_length, /* sq_length */
12894 PyUnicode_Concat, /* sq_concat */
12895 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12896 (ssizeargfunc) unicode_getitem, /* sq_item */
12897 0, /* sq_slice */
12898 0, /* sq_ass_item */
12899 0, /* sq_ass_slice */
12900 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012901};
12902
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012903static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012904unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012905{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012906 if (PyUnicode_READY(self) == -1)
12907 return NULL;
12908
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012909 if (PyIndex_Check(item)) {
12910 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012911 if (i == -1 && PyErr_Occurred())
12912 return NULL;
12913 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012914 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012915 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012916 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012917 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012918 PyObject *result;
12919 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012920 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012921 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012922
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012923 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012924 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012925 return NULL;
12926 }
12927
12928 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010012929 Py_INCREF(unicode_empty);
12930 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012931 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010012932 slicelength == PyUnicode_GET_LENGTH(self)) {
12933 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000012934 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012935 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020012936 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012937 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012938 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012939 src_kind = PyUnicode_KIND(self);
12940 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020012941 if (!PyUnicode_IS_ASCII(self)) {
12942 kind_limit = kind_maxchar_limit(src_kind);
12943 max_char = 0;
12944 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12945 ch = PyUnicode_READ(src_kind, src_data, cur);
12946 if (ch > max_char) {
12947 max_char = ch;
12948 if (max_char >= kind_limit)
12949 break;
12950 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012951 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012952 }
Victor Stinner55c99112011-10-13 01:17:06 +020012953 else
12954 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012955 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012956 if (result == NULL)
12957 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012958 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012959 dest_data = PyUnicode_DATA(result);
12960
12961 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012962 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12963 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012964 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012965 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012966 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012967 } else {
12968 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12969 return NULL;
12970 }
12971}
12972
12973static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012974 (lenfunc)unicode_length, /* mp_length */
12975 (binaryfunc)unicode_subscript, /* mp_subscript */
12976 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012977};
12978
Guido van Rossumd57fd912000-03-10 22:53:23 +000012979
Guido van Rossumd57fd912000-03-10 22:53:23 +000012980/* Helpers for PyUnicode_Format() */
12981
12982static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012983getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012984{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012985 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012986 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012987 (*p_argidx)++;
12988 if (arglen < 0)
12989 return args;
12990 else
12991 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012992 }
12993 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012994 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012995 return NULL;
12996}
12997
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012998/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012999
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013000static PyObject *
13001formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013002{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013003 char *p;
13004 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013005 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013006
Guido van Rossumd57fd912000-03-10 22:53:23 +000013007 x = PyFloat_AsDouble(v);
13008 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013009 return NULL;
13010
Guido van Rossumd57fd912000-03-10 22:53:23 +000013011 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013012 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013013
Eric Smith0923d1d2009-04-16 20:16:10 +000013014 p = PyOS_double_to_string(x, type, prec,
13015 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013016 if (p == NULL)
13017 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013018 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013019 PyMem_Free(p);
13020 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013021}
13022
Tim Peters38fd5b62000-09-21 05:43:11 +000013023static PyObject*
13024formatlong(PyObject *val, int flags, int prec, int type)
13025{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013026 char *buf;
13027 int len;
13028 PyObject *str; /* temporary string object. */
13029 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013030
Benjamin Peterson14339b62009-01-31 16:36:08 +000013031 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13032 if (!str)
13033 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013034 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013035 Py_DECREF(str);
13036 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013037}
13038
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013039static Py_UCS4
13040formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013041{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013042 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013043 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013044 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013045 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013046 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013047 goto onError;
13048 }
13049 else {
13050 /* Integer input truncated to a character */
13051 long x;
13052 x = PyLong_AsLong(v);
13053 if (x == -1 && PyErr_Occurred())
13054 goto onError;
13055
Victor Stinner8faf8212011-12-08 22:14:11 +010013056 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013057 PyErr_SetString(PyExc_OverflowError,
13058 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013059 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013060 }
13061
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013062 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013063 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013064
Benjamin Peterson29060642009-01-31 22:14:21 +000013065 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013066 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013067 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013068 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013069}
13070
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013071static int
13072repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13073{
13074 int r;
13075 assert(count > 0);
13076 assert(PyUnicode_Check(obj));
13077 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013078 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013079 if (repeated == NULL)
13080 return -1;
13081 r = _PyAccu_Accumulate(acc, repeated);
13082 Py_DECREF(repeated);
13083 return r;
13084 }
13085 else {
13086 do {
13087 if (_PyAccu_Accumulate(acc, obj))
13088 return -1;
13089 } while (--count);
13090 return 0;
13091 }
13092}
13093
Alexander Belopolsky40018472011-02-26 01:02:56 +000013094PyObject *
13095PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013096{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013097 void *fmt;
13098 int fmtkind;
13099 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013100 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013101 int r;
13102 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013104 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013105 PyObject *temp = NULL;
13106 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013107 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013108 _PyAccu acc;
13109 static PyObject *plus, *minus, *blank, *zero, *percent;
13110
13111 if (!plus && !(plus = get_latin1_char('+')))
13112 return NULL;
13113 if (!minus && !(minus = get_latin1_char('-')))
13114 return NULL;
13115 if (!blank && !(blank = get_latin1_char(' ')))
13116 return NULL;
13117 if (!zero && !(zero = get_latin1_char('0')))
13118 return NULL;
13119 if (!percent && !(percent = get_latin1_char('%')))
13120 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013121
Guido van Rossumd57fd912000-03-10 22:53:23 +000013122 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013123 PyErr_BadInternalCall();
13124 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013126 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013127 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013128 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013129 if (_PyAccu_Init(&acc))
13130 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013131 fmt = PyUnicode_DATA(uformat);
13132 fmtkind = PyUnicode_KIND(uformat);
13133 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13134 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013135
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013137 arglen = PyTuple_Size(args);
13138 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139 }
13140 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013141 arglen = -1;
13142 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013143 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013144 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013145 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013146 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013147
13148 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013149 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013150 PyObject *nonfmt;
13151 Py_ssize_t nonfmtpos;
13152 nonfmtpos = fmtpos++;
13153 while (fmtcnt >= 0 &&
13154 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13155 fmtpos++;
13156 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013157 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013158 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013159 if (nonfmt == NULL)
13160 goto onError;
13161 r = _PyAccu_Accumulate(&acc, nonfmt);
13162 Py_DECREF(nonfmt);
13163 if (r)
13164 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013165 }
13166 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013167 /* Got a format specifier */
13168 int flags = 0;
13169 Py_ssize_t width = -1;
13170 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013171 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013172 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013173 int isnumok;
13174 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013175 void *pbuf = NULL;
13176 Py_ssize_t pindex, len;
13177 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013179 fmtpos++;
13180 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13181 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013182 Py_ssize_t keylen;
13183 PyObject *key;
13184 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013185
Benjamin Peterson29060642009-01-31 22:14:21 +000013186 if (dict == NULL) {
13187 PyErr_SetString(PyExc_TypeError,
13188 "format requires a mapping");
13189 goto onError;
13190 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013191 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013192 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013193 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013194 /* Skip over balanced parentheses */
13195 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013196 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013197 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013198 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013199 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013200 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013201 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013202 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013203 if (fmtcnt < 0 || pcount > 0) {
13204 PyErr_SetString(PyExc_ValueError,
13205 "incomplete format key");
13206 goto onError;
13207 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013208 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013209 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013210 if (key == NULL)
13211 goto onError;
13212 if (args_owned) {
13213 Py_DECREF(args);
13214 args_owned = 0;
13215 }
13216 args = PyObject_GetItem(dict, key);
13217 Py_DECREF(key);
13218 if (args == NULL) {
13219 goto onError;
13220 }
13221 args_owned = 1;
13222 arglen = -1;
13223 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013224 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013225 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013226 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013227 case '-': flags |= F_LJUST; continue;
13228 case '+': flags |= F_SIGN; continue;
13229 case ' ': flags |= F_BLANK; continue;
13230 case '#': flags |= F_ALT; continue;
13231 case '0': flags |= F_ZERO; continue;
13232 }
13233 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013234 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013235 if (c == '*') {
13236 v = getnextarg(args, arglen, &argidx);
13237 if (v == NULL)
13238 goto onError;
13239 if (!PyLong_Check(v)) {
13240 PyErr_SetString(PyExc_TypeError,
13241 "* wants int");
13242 goto onError;
13243 }
13244 width = PyLong_AsLong(v);
13245 if (width == -1 && PyErr_Occurred())
13246 goto onError;
13247 if (width < 0) {
13248 flags |= F_LJUST;
13249 width = -width;
13250 }
13251 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013252 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013253 }
13254 else if (c >= '0' && c <= '9') {
13255 width = c - '0';
13256 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013257 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013258 if (c < '0' || c > '9')
13259 break;
13260 if ((width*10) / 10 != width) {
13261 PyErr_SetString(PyExc_ValueError,
13262 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013263 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013264 }
13265 width = width*10 + (c - '0');
13266 }
13267 }
13268 if (c == '.') {
13269 prec = 0;
13270 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013271 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013272 if (c == '*') {
13273 v = getnextarg(args, arglen, &argidx);
13274 if (v == NULL)
13275 goto onError;
13276 if (!PyLong_Check(v)) {
13277 PyErr_SetString(PyExc_TypeError,
13278 "* wants int");
13279 goto onError;
13280 }
13281 prec = PyLong_AsLong(v);
13282 if (prec == -1 && PyErr_Occurred())
13283 goto onError;
13284 if (prec < 0)
13285 prec = 0;
13286 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013287 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013288 }
13289 else if (c >= '0' && c <= '9') {
13290 prec = c - '0';
13291 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013292 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013293 if (c < '0' || c > '9')
13294 break;
13295 if ((prec*10) / 10 != prec) {
13296 PyErr_SetString(PyExc_ValueError,
13297 "prec too big");
13298 goto onError;
13299 }
13300 prec = prec*10 + (c - '0');
13301 }
13302 }
13303 } /* prec */
13304 if (fmtcnt >= 0) {
13305 if (c == 'h' || c == 'l' || c == 'L') {
13306 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013307 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013308 }
13309 }
13310 if (fmtcnt < 0) {
13311 PyErr_SetString(PyExc_ValueError,
13312 "incomplete format");
13313 goto onError;
13314 }
13315 if (c != '%') {
13316 v = getnextarg(args, arglen, &argidx);
13317 if (v == NULL)
13318 goto onError;
13319 }
13320 sign = 0;
13321 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013322 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013323 switch (c) {
13324
13325 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013326 _PyAccu_Accumulate(&acc, percent);
13327 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013328
13329 case 's':
13330 case 'r':
13331 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013332 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013333 temp = v;
13334 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013335 }
13336 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013337 if (c == 's')
13338 temp = PyObject_Str(v);
13339 else if (c == 'r')
13340 temp = PyObject_Repr(v);
13341 else
13342 temp = PyObject_ASCII(v);
13343 if (temp == NULL)
13344 goto onError;
13345 if (PyUnicode_Check(temp))
13346 /* nothing to do */;
13347 else {
13348 Py_DECREF(temp);
13349 PyErr_SetString(PyExc_TypeError,
13350 "%s argument has non-string str()");
13351 goto onError;
13352 }
13353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013354 if (PyUnicode_READY(temp) == -1) {
13355 Py_CLEAR(temp);
13356 goto onError;
13357 }
13358 pbuf = PyUnicode_DATA(temp);
13359 kind = PyUnicode_KIND(temp);
13360 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013361 if (prec >= 0 && len > prec)
13362 len = prec;
13363 break;
13364
13365 case 'i':
13366 case 'd':
13367 case 'u':
13368 case 'o':
13369 case 'x':
13370 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013371 isnumok = 0;
13372 if (PyNumber_Check(v)) {
13373 PyObject *iobj=NULL;
13374
13375 if (PyLong_Check(v)) {
13376 iobj = v;
13377 Py_INCREF(iobj);
13378 }
13379 else {
13380 iobj = PyNumber_Long(v);
13381 }
13382 if (iobj!=NULL) {
13383 if (PyLong_Check(iobj)) {
13384 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013385 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013386 Py_DECREF(iobj);
13387 if (!temp)
13388 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013389 if (PyUnicode_READY(temp) == -1) {
13390 Py_CLEAR(temp);
13391 goto onError;
13392 }
13393 pbuf = PyUnicode_DATA(temp);
13394 kind = PyUnicode_KIND(temp);
13395 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013396 sign = 1;
13397 }
13398 else {
13399 Py_DECREF(iobj);
13400 }
13401 }
13402 }
13403 if (!isnumok) {
13404 PyErr_Format(PyExc_TypeError,
13405 "%%%c format: a number is required, "
13406 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13407 goto onError;
13408 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013409 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013410 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013411 fillobj = zero;
13412 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013413 break;
13414
13415 case 'e':
13416 case 'E':
13417 case 'f':
13418 case 'F':
13419 case 'g':
13420 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013421 temp = formatfloat(v, flags, prec, c);
13422 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013423 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013424 if (PyUnicode_READY(temp) == -1) {
13425 Py_CLEAR(temp);
13426 goto onError;
13427 }
13428 pbuf = PyUnicode_DATA(temp);
13429 kind = PyUnicode_KIND(temp);
13430 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013431 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013432 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013433 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013434 fillobj = zero;
13435 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013436 break;
13437
13438 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013439 {
13440 Py_UCS4 ch = formatchar(v);
13441 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013442 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013443 temp = _PyUnicode_FromUCS4(&ch, 1);
13444 if (temp == NULL)
13445 goto onError;
13446 pbuf = PyUnicode_DATA(temp);
13447 kind = PyUnicode_KIND(temp);
13448 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013449 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013450 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013451
13452 default:
13453 PyErr_Format(PyExc_ValueError,
13454 "unsupported format character '%c' (0x%x) "
13455 "at index %zd",
13456 (31<=c && c<=126) ? (char)c : '?',
13457 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013458 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013459 goto onError;
13460 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013461 /* pbuf is initialized here. */
13462 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013463 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013464 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13465 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013466 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013467 pindex++;
13468 }
13469 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13470 signobj = plus;
13471 len--;
13472 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013473 }
13474 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013475 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013476 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013477 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013478 else
13479 sign = 0;
13480 }
13481 if (width < len)
13482 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013483 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013484 if (fill != ' ') {
13485 assert(signobj != NULL);
13486 if (_PyAccu_Accumulate(&acc, signobj))
13487 goto onError;
13488 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013489 if (width > len)
13490 width--;
13491 }
13492 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013493 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013494 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013495 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013496 second = get_latin1_char(
13497 PyUnicode_READ(kind, pbuf, pindex + 1));
13498 pindex += 2;
13499 if (second == NULL ||
13500 _PyAccu_Accumulate(&acc, zero) ||
13501 _PyAccu_Accumulate(&acc, second))
13502 goto onError;
13503 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013504 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013505 width -= 2;
13506 if (width < 0)
13507 width = 0;
13508 len -= 2;
13509 }
13510 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013511 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013512 if (repeat_accumulate(&acc, fillobj, width - len))
13513 goto onError;
13514 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013515 }
13516 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013517 if (sign) {
13518 assert(signobj != NULL);
13519 if (_PyAccu_Accumulate(&acc, signobj))
13520 goto onError;
13521 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013522 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013523 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13524 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013525 second = get_latin1_char(
13526 PyUnicode_READ(kind, pbuf, pindex + 1));
13527 pindex += 2;
13528 if (second == NULL ||
13529 _PyAccu_Accumulate(&acc, zero) ||
13530 _PyAccu_Accumulate(&acc, second))
13531 goto onError;
13532 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013533 }
13534 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013535 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013536 if (temp != NULL) {
13537 assert(pbuf == PyUnicode_DATA(temp));
13538 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013539 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013540 else {
13541 const char *p = (const char *) pbuf;
13542 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013543 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013544 v = PyUnicode_FromKindAndData(kind, p, len);
13545 }
13546 if (v == NULL)
13547 goto onError;
13548 r = _PyAccu_Accumulate(&acc, v);
13549 Py_DECREF(v);
13550 if (r)
13551 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013552 if (width > len && repeat_accumulate(&acc, blank, width - len))
13553 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013554 if (dict && (argidx < arglen) && c != '%') {
13555 PyErr_SetString(PyExc_TypeError,
13556 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013557 goto onError;
13558 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013559 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013560 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013561 } /* until end */
13562 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013563 PyErr_SetString(PyExc_TypeError,
13564 "not all arguments converted during string formatting");
13565 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013566 }
13567
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013568 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013569 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013570 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013571 }
13572 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013573 Py_XDECREF(temp);
13574 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013575 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013576
Benjamin Peterson29060642009-01-31 22:14:21 +000013577 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013578 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013579 Py_XDECREF(temp);
13580 Py_XDECREF(second);
13581 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013582 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013583 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013584 }
13585 return NULL;
13586}
13587
Jeremy Hylton938ace62002-07-17 16:30:39 +000013588static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013589unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13590
Tim Peters6d6c1a32001-08-02 04:15:00 +000013591static PyObject *
13592unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13593{
Benjamin Peterson29060642009-01-31 22:14:21 +000013594 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013595 static char *kwlist[] = {"object", "encoding", "errors", 0};
13596 char *encoding = NULL;
13597 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013598
Benjamin Peterson14339b62009-01-31 16:36:08 +000013599 if (type != &PyUnicode_Type)
13600 return unicode_subtype_new(type, args, kwds);
13601 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013602 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013603 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013604 if (x == NULL) {
13605 Py_INCREF(unicode_empty);
13606 return unicode_empty;
13607 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013608 if (encoding == NULL && errors == NULL)
13609 return PyObject_Str(x);
13610 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013611 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013612}
13613
Guido van Rossume023fe02001-08-30 03:12:59 +000013614static PyObject *
13615unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13616{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013617 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013618 Py_ssize_t length, char_size;
13619 int share_wstr, share_utf8;
13620 unsigned int kind;
13621 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013622
Benjamin Peterson14339b62009-01-31 16:36:08 +000013623 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013624
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013625 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013626 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013627 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013628 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013629 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013630 return NULL;
13631
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013632 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013633 if (self == NULL) {
13634 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013635 return NULL;
13636 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013637 kind = PyUnicode_KIND(unicode);
13638 length = PyUnicode_GET_LENGTH(unicode);
13639
13640 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013641#ifdef Py_DEBUG
13642 _PyUnicode_HASH(self) = -1;
13643#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013644 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013645#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013646 _PyUnicode_STATE(self).interned = 0;
13647 _PyUnicode_STATE(self).kind = kind;
13648 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013649 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013650 _PyUnicode_STATE(self).ready = 1;
13651 _PyUnicode_WSTR(self) = NULL;
13652 _PyUnicode_UTF8_LENGTH(self) = 0;
13653 _PyUnicode_UTF8(self) = NULL;
13654 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013655 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013656
13657 share_utf8 = 0;
13658 share_wstr = 0;
13659 if (kind == PyUnicode_1BYTE_KIND) {
13660 char_size = 1;
13661 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13662 share_utf8 = 1;
13663 }
13664 else if (kind == PyUnicode_2BYTE_KIND) {
13665 char_size = 2;
13666 if (sizeof(wchar_t) == 2)
13667 share_wstr = 1;
13668 }
13669 else {
13670 assert(kind == PyUnicode_4BYTE_KIND);
13671 char_size = 4;
13672 if (sizeof(wchar_t) == 4)
13673 share_wstr = 1;
13674 }
13675
13676 /* Ensure we won't overflow the length. */
13677 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13678 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013679 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013680 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013681 data = PyObject_MALLOC((length + 1) * char_size);
13682 if (data == NULL) {
13683 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013684 goto onError;
13685 }
13686
Victor Stinnerc3c74152011-10-02 20:39:55 +020013687 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013688 if (share_utf8) {
13689 _PyUnicode_UTF8_LENGTH(self) = length;
13690 _PyUnicode_UTF8(self) = data;
13691 }
13692 if (share_wstr) {
13693 _PyUnicode_WSTR_LENGTH(self) = length;
13694 _PyUnicode_WSTR(self) = (wchar_t *)data;
13695 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013696
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013697 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013698 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013699 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013700#ifdef Py_DEBUG
13701 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13702#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013703 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013704 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013705
13706onError:
13707 Py_DECREF(unicode);
13708 Py_DECREF(self);
13709 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013710}
13711
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013712PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013713 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013714\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013715Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013716encoding defaults to the current default string encoding.\n\
13717errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013718
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013719static PyObject *unicode_iter(PyObject *seq);
13720
Guido van Rossumd57fd912000-03-10 22:53:23 +000013721PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013722 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013723 "str", /* tp_name */
13724 sizeof(PyUnicodeObject), /* tp_size */
13725 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013726 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013727 (destructor)unicode_dealloc, /* tp_dealloc */
13728 0, /* tp_print */
13729 0, /* tp_getattr */
13730 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013731 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013732 unicode_repr, /* tp_repr */
13733 &unicode_as_number, /* tp_as_number */
13734 &unicode_as_sequence, /* tp_as_sequence */
13735 &unicode_as_mapping, /* tp_as_mapping */
13736 (hashfunc) unicode_hash, /* tp_hash*/
13737 0, /* tp_call*/
13738 (reprfunc) unicode_str, /* tp_str */
13739 PyObject_GenericGetAttr, /* tp_getattro */
13740 0, /* tp_setattro */
13741 0, /* tp_as_buffer */
13742 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013743 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013744 unicode_doc, /* tp_doc */
13745 0, /* tp_traverse */
13746 0, /* tp_clear */
13747 PyUnicode_RichCompare, /* tp_richcompare */
13748 0, /* tp_weaklistoffset */
13749 unicode_iter, /* tp_iter */
13750 0, /* tp_iternext */
13751 unicode_methods, /* tp_methods */
13752 0, /* tp_members */
13753 0, /* tp_getset */
13754 &PyBaseObject_Type, /* tp_base */
13755 0, /* tp_dict */
13756 0, /* tp_descr_get */
13757 0, /* tp_descr_set */
13758 0, /* tp_dictoffset */
13759 0, /* tp_init */
13760 0, /* tp_alloc */
13761 unicode_new, /* tp_new */
13762 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013763};
13764
13765/* Initialize the Unicode implementation */
13766
Victor Stinner3a50e702011-10-18 21:21:00 +020013767int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013768{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013769 int i;
13770
Thomas Wouters477c8d52006-05-27 19:21:47 +000013771 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013772 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013773 0x000A, /* LINE FEED */
13774 0x000D, /* CARRIAGE RETURN */
13775 0x001C, /* FILE SEPARATOR */
13776 0x001D, /* GROUP SEPARATOR */
13777 0x001E, /* RECORD SEPARATOR */
13778 0x0085, /* NEXT LINE */
13779 0x2028, /* LINE SEPARATOR */
13780 0x2029, /* PARAGRAPH SEPARATOR */
13781 };
13782
Fred Drakee4315f52000-05-09 19:53:39 +000013783 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013784 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013785 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013786 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010013787 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013788
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013789 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013790 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013791 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013792 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013793
13794 /* initialize the linebreak bloom filter */
13795 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013796 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013797 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013798
13799 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013800
13801#ifdef HAVE_MBCS
13802 winver.dwOSVersionInfoSize = sizeof(winver);
13803 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13804 PyErr_SetFromWindowsErr(0);
13805 return -1;
13806 }
13807#endif
13808 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013809}
13810
13811/* Finalize the Unicode implementation */
13812
/* Currently a no-op: nothing is released and 0 is always returned. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13818
Guido van Rossumd57fd912000-03-10 22:53:23 +000013819void
Thomas Wouters78890102000-07-22 19:25:51 +000013820_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013821{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013822 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013823
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013824 Py_XDECREF(unicode_empty);
13825 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013826
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013827 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013828 if (unicode_latin1[i]) {
13829 Py_DECREF(unicode_latin1[i]);
13830 unicode_latin1[i] = NULL;
13831 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013832 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013833 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013834 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013835}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013836
Walter Dörwald16807132007-05-25 13:52:07 +000013837void
13838PyUnicode_InternInPlace(PyObject **p)
13839{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013840 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013841 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013842#ifdef Py_DEBUG
13843 assert(s != NULL);
13844 assert(_PyUnicode_CHECK(s));
13845#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013846 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013847 return;
13848#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013849 /* If it's a subclass, we don't really know what putting
13850 it in the interned dict might do. */
13851 if (!PyUnicode_CheckExact(s))
13852 return;
13853 if (PyUnicode_CHECK_INTERNED(s))
13854 return;
13855 if (interned == NULL) {
13856 interned = PyDict_New();
13857 if (interned == NULL) {
13858 PyErr_Clear(); /* Don't leave an exception */
13859 return;
13860 }
13861 }
13862 /* It might be that the GetItem call fails even
13863 though the key is present in the dictionary,
13864 namely when this happens during a stack overflow. */
13865 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013866 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013867 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013868
Benjamin Peterson29060642009-01-31 22:14:21 +000013869 if (t) {
13870 Py_INCREF(t);
13871 Py_DECREF(*p);
13872 *p = t;
13873 return;
13874 }
Walter Dörwald16807132007-05-25 13:52:07 +000013875
Benjamin Peterson14339b62009-01-31 16:36:08 +000013876 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013877 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013878 PyErr_Clear();
13879 PyThreadState_GET()->recursion_critical = 0;
13880 return;
13881 }
13882 PyThreadState_GET()->recursion_critical = 0;
13883 /* The two references in interned are not counted by refcnt.
13884 The deallocator will take care of this */
13885 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013886 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013887}
13888
13889void
13890PyUnicode_InternImmortal(PyObject **p)
13891{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013892 PyUnicode_InternInPlace(p);
13893 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020013894 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013895 Py_INCREF(*p);
13896 }
Walter Dörwald16807132007-05-25 13:52:07 +000013897}
13898
13899PyObject *
13900PyUnicode_InternFromString(const char *cp)
13901{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013902 PyObject *s = PyUnicode_FromString(cp);
13903 if (s == NULL)
13904 return NULL;
13905 PyUnicode_InternInPlace(&s);
13906 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013907}
13908
Alexander Belopolsky40018472011-02-26 01:02:56 +000013909void
13910_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013911{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013912 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013913 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013914 Py_ssize_t i, n;
13915 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013916
Benjamin Peterson14339b62009-01-31 16:36:08 +000013917 if (interned == NULL || !PyDict_Check(interned))
13918 return;
13919 keys = PyDict_Keys(interned);
13920 if (keys == NULL || !PyList_Check(keys)) {
13921 PyErr_Clear();
13922 return;
13923 }
Walter Dörwald16807132007-05-25 13:52:07 +000013924
Benjamin Peterson14339b62009-01-31 16:36:08 +000013925 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13926 detector, interned unicode strings are not forcibly deallocated;
13927 rather, we give them their stolen references back, and then clear
13928 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013929
Benjamin Peterson14339b62009-01-31 16:36:08 +000013930 n = PyList_GET_SIZE(keys);
13931 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013932 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013933 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013934 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013935 if (PyUnicode_READY(s) == -1) {
13936 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013937 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013938 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013939 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013940 case SSTATE_NOT_INTERNED:
13941 /* XXX Shouldn't happen */
13942 break;
13943 case SSTATE_INTERNED_IMMORTAL:
13944 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013945 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013946 break;
13947 case SSTATE_INTERNED_MORTAL:
13948 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013949 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013950 break;
13951 default:
13952 Py_FatalError("Inconsistent interned string state.");
13953 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013954 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013955 }
13956 fprintf(stderr, "total size of all interned strings: "
13957 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13958 "mortal/immortal\n", mortal_size, immortal_size);
13959 Py_DECREF(keys);
13960 PyDict_Clear(interned);
13961 Py_DECREF(interned);
13962 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013963}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013964
13965
13966/********************* Unicode Iterator **************************/
13967
13968typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013969 PyObject_HEAD
13970 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013971 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013972} unicodeiterobject;
13973
13974static void
13975unicodeiter_dealloc(unicodeiterobject *it)
13976{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013977 _PyObject_GC_UNTRACK(it);
13978 Py_XDECREF(it->it_seq);
13979 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013980}
13981
13982static int
13983unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13984{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013985 Py_VISIT(it->it_seq);
13986 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013987}
13988
13989static PyObject *
13990unicodeiter_next(unicodeiterobject *it)
13991{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013992 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013993
Benjamin Peterson14339b62009-01-31 16:36:08 +000013994 assert(it != NULL);
13995 seq = it->it_seq;
13996 if (seq == NULL)
13997 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013998 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014000 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14001 int kind = PyUnicode_KIND(seq);
14002 void *data = PyUnicode_DATA(seq);
14003 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14004 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014005 if (item != NULL)
14006 ++it->it_index;
14007 return item;
14008 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014009
Benjamin Peterson14339b62009-01-31 16:36:08 +000014010 Py_DECREF(seq);
14011 it->it_seq = NULL;
14012 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014013}
14014
14015static PyObject *
14016unicodeiter_len(unicodeiterobject *it)
14017{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014018 Py_ssize_t len = 0;
14019 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014020 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014021 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014022}
14023
14024PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14025
14026static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014027 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014028 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014029 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014030};
14031
14032PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014033 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14034 "str_iterator", /* tp_name */
14035 sizeof(unicodeiterobject), /* tp_basicsize */
14036 0, /* tp_itemsize */
14037 /* methods */
14038 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14039 0, /* tp_print */
14040 0, /* tp_getattr */
14041 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014042 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014043 0, /* tp_repr */
14044 0, /* tp_as_number */
14045 0, /* tp_as_sequence */
14046 0, /* tp_as_mapping */
14047 0, /* tp_hash */
14048 0, /* tp_call */
14049 0, /* tp_str */
14050 PyObject_GenericGetAttr, /* tp_getattro */
14051 0, /* tp_setattro */
14052 0, /* tp_as_buffer */
14053 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14054 0, /* tp_doc */
14055 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14056 0, /* tp_clear */
14057 0, /* tp_richcompare */
14058 0, /* tp_weaklistoffset */
14059 PyObject_SelfIter, /* tp_iter */
14060 (iternextfunc)unicodeiter_next, /* tp_iternext */
14061 unicodeiter_methods, /* tp_methods */
14062 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014063};
14064
14065static PyObject *
14066unicode_iter(PyObject *seq)
14067{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014068 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014069
Benjamin Peterson14339b62009-01-31 16:36:08 +000014070 if (!PyUnicode_Check(seq)) {
14071 PyErr_BadInternalCall();
14072 return NULL;
14073 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014074 if (PyUnicode_READY(seq) == -1)
14075 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014076 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14077 if (it == NULL)
14078 return NULL;
14079 it->it_index = 0;
14080 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014081 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014082 _PyObject_GC_TRACK(it);
14083 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014084}
14085
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014086
/* Return the number of Py_UNICODE elements in `u` that precede the
   terminating null element.  `u` must be null-terminated. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    /* Count in the return type: an int counter would overflow (undefined
       behaviour) for strings longer than INT_MAX elements. */
    size_t res = 0;
    while (*u++)
        res++;
    return res;
}
14095
/* Copy the null-terminated string `s2`, terminator included, into the
   buffer at `s1` and return `s1`. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        const Py_UNICODE ch = *s2++;

        *dst++ = ch;
        if (!ch)
            break;              /* terminator has been copied */
    }
    return s1;
}
14103
/* Copy at most `n` elements of the null-terminated string `s2` into
   `s1` and return `s1`.

   Copying stops right after the terminating null element has been
   copied.  As with strncpy(), the result is NOT null-terminated when
   `s2` holds `n` or more characters.  Unlike strncpy(), the rest of the
   destination is not padded with null elements.  Nothing is written
   when `n` is 0. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    /* Check the remaining budget *before* each store; checking it after
       the store would write n + 1 elements and overrun the buffer. */
    for (; n != 0; n--) {
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
14113
/* Append the null-terminated string `s2`, terminator included, to the
   end of the null-terminated string in `s1` and return `s1`. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    /* Find the terminator of the existing contents... */
    while (*dst)
        dst++;
    /* ...and copy s2 over it, up to and including s2's terminator. */
    while ((*dst++ = *s2++))
        ;
    return s1;
}
14122
/* Compare two null-terminated strings element by element.  Returns -1,
   0 or +1 if `s1` is respectively less than, equal to or greater than
   `s2`.  A string that is a proper prefix of the other compares as
   smaller. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    for (;; s1++, s2++) {
        const Py_UNICODE c1 = *s1;
        const Py_UNICODE c2 = *s2;

        /* At least one string ended: the one that still has characters
           left (if any) is the greater one. */
        if (c1 == 0 || c2 == 0)
            return (c1 != 0) - (c2 != 0);
        if (c1 != c2)
            return (c1 < c2) ? -1 : +1;
    }
}
14136
/* Compare at most the first `n` elements of two null-terminated
   strings.  Returns -1 or +1 at the first differing element, and 0 if
   no difference is found before `n` elements have been compared or a
   shared terminator is reached. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t remaining = n;

    while (remaining > 0) {
        const Py_UNICODE c1 = *s1++;
        const Py_UNICODE c2 = *s2++;

        if (c1 != c2)
            return (c1 < c2) ? -1 : +1;
        if (c1 == '\0')
            return 0;           /* both strings ended here */
        remaining--;
    }
    return 0;
}
14153
/* Return a pointer to the first occurrence of `c` in the
   null-terminated string `s`, or NULL if there is none.  The
   terminator itself is never matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
14163
/* Return a pointer to the last occurrence of `c` in the
   null-terminated string `s`, or NULL if there is none.  The
   terminator itself is never matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *last = NULL;

    /* Single forward pass, remembering the most recent match. */
    for (; *s; s++) {
        if (*s == c)
            last = s;
    }
    return (Py_UNICODE*)last;
}
Victor Stinner331ea922010-08-10 16:37:20 +000014176
Victor Stinner71133ff2010-09-01 23:43:53 +000014177Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014178PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014179{
Victor Stinner577db2c2011-10-11 22:12:48 +020014180 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014181 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014183 if (!PyUnicode_Check(unicode)) {
14184 PyErr_BadArgument();
14185 return NULL;
14186 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014187 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014188 if (u == NULL)
14189 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014190 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014191 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014192 PyErr_NoMemory();
14193 return NULL;
14194 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014195 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014196 size *= sizeof(Py_UNICODE);
14197 copy = PyMem_Malloc(size);
14198 if (copy == NULL) {
14199 PyErr_NoMemory();
14200 return NULL;
14201 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014202 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014203 return copy;
14204}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014205
Georg Brandl66c221e2010-10-14 07:04:07 +000014206/* A _string module, to export formatter_parser and formatter_field_name_split
14207 to the string.Formatter class implemented in Python. */
14208
14209static PyMethodDef _string_methods[] = {
14210 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14211 METH_O, PyDoc_STR("split the argument as a field name")},
14212 {"formatter_parser", (PyCFunction) formatter_parser,
14213 METH_O, PyDoc_STR("parse the argument as a format string")},
14214 {NULL, NULL}
14215};
14216
14217static struct PyModuleDef _string_module = {
14218 PyModuleDef_HEAD_INIT,
14219 "_string",
14220 PyDoc_STR("string helper module"),
14221 0,
14222 _string_methods,
14223 NULL,
14224 NULL,
14225 NULL,
14226 NULL
14227};
14228
14229PyMODINIT_FUNC
14230PyInit__string(void)
14231{
14232 return PyModule_Create(&_string_module);
14233}
14234
14235
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014236#ifdef __cplusplus
14237}
14238#endif