blob: b3d6de219607478c8bd36de2a2c61591db6d242c [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Endianness switches; defaults to little endian */
50
51#ifdef WORDS_BIGENDIAN
52# define BYTEORDER_IS_BIG_ENDIAN
53#else
54# define BYTEORDER_IS_LITTLE_ENDIAN
55#endif
56
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000057/* --- Globals ------------------------------------------------------------
58
59 The globals are initialized by the _PyUnicode_Init() API and should
60 not be used before calling that API.
61
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
Victor Stinner8faf8212011-12-08 22:14:11 +010069/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
70#define MAX_UNICODE 0x10ffff
71
Victor Stinner910337b2011-10-03 03:20:16 +020072#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020073# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020074#else
75# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
76#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
Victor Stinnere90fe6a2011-10-01 16:48:13 +020078#define _PyUnicode_UTF8(op) \
79 (((PyCompactUnicodeObject*)(op))->utf8)
80#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020081 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020082 assert(PyUnicode_IS_READY(op)), \
83 PyUnicode_IS_COMPACT_ASCII(op) ? \
84 ((char*)((PyASCIIObject*)(op) + 1)) : \
85 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +020086#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020087 (((PyCompactUnicodeObject*)(op))->utf8_length)
88#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020089 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020090 assert(PyUnicode_IS_READY(op)), \
91 PyUnicode_IS_COMPACT_ASCII(op) ? \
92 ((PyASCIIObject*)(op))->length : \
93 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +020094#define _PyUnicode_WSTR(op) \
95 (((PyASCIIObject*)(op))->wstr)
96#define _PyUnicode_WSTR_LENGTH(op) \
97 (((PyCompactUnicodeObject*)(op))->wstr_length)
98#define _PyUnicode_LENGTH(op) \
99 (((PyASCIIObject *)(op))->length)
100#define _PyUnicode_STATE(op) \
101 (((PyASCIIObject *)(op))->state)
102#define _PyUnicode_HASH(op) \
103 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200104#define _PyUnicode_KIND(op) \
105 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200107#define _PyUnicode_GET_LENGTH(op) \
108 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200109 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200110#define _PyUnicode_DATA_ANY(op) \
111 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
Victor Stinner910337b2011-10-03 03:20:16 +0200113#undef PyUnicode_READY
114#define PyUnicode_READY(op) \
115 (assert(_PyUnicode_CHECK(op)), \
116 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200117 0 : \
Victor Stinner7931d9a2011-11-04 00:22:48 +0100118 _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
Victor Stinnerc379ead2011-10-03 12:52:27 +0200120#define _PyUnicode_SHARE_UTF8(op) \
121 (assert(_PyUnicode_CHECK(op)), \
122 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
123 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is the very same pointer as its canonical
   character data, i.e. there is no separate wchar_t buffer.
   Every reference goes through the macro parameter, so the macro does not
   depend on the caller's variable being named "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
Victor Stinner829c0ad2011-10-03 01:08:02 +0200128/* true if the Unicode object has an allocated UTF-8 memory block
129 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200130#define _PyUnicode_HAS_UTF8_MEMORY(op) \
131 (assert(_PyUnicode_CHECK(op)), \
132 (!PyUnicode_IS_COMPACT_ASCII(op) \
133 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200134 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
135
Victor Stinner03490912011-10-03 23:45:12 +0200136/* true if the Unicode object has an allocated wstr memory block
137 (not shared with other data) */
138#define _PyUnicode_HAS_WSTR_MEMORY(op) \
139 (assert(_PyUnicode_CHECK(op)), \
140 (_PyUnicode_WSTR(op) && \
141 (!PyUnicode_IS_READY(op) || \
142 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
143
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *". to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each argument is evaluated exactly once and is parenthesized before use,
   so expression arguments (e.g. "buf + offset") keep their own meaning.
   The first loop copies four characters per iteration; the second loop
   copies the remaining 0..3 characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200167
Walter Dörwald16807132007-05-25 13:52:07 +0000168/* This dictionary holds all interned unicode strings. Note that references
169 to strings in this dictionary are *not* counted in the string's ob_refcnt.
170 When the interned string reaches a refcnt of 0 the string deallocation
171 function will delete the reference from this dictionary.
172
   Another way to look at this is to say that the actual reference
   count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000175*/
176static PyObject *interned;
177
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000178/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200179static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200181/* List of static strings. */
182static _Py_Identifier *static_strings;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* Single character Unicode strings in the Latin-1 range are being
185 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200186static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187
Christian Heimes190d79e2008-01-30 11:58:22 +0000188/* Fast detection of the most frequent whitespace characters */
189const unsigned char _Py_ascii_whitespace[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000190 0, 0, 0, 0, 0, 0, 0, 0,
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000191/* case 0x0009: * CHARACTER TABULATION */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000192/* case 0x000A: * LINE FEED */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000193/* case 0x000B: * LINE TABULATION */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000194/* case 0x000C: * FORM FEED */
195/* case 0x000D: * CARRIAGE RETURN */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000196 0, 1, 1, 1, 1, 1, 0, 0,
197 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000198/* case 0x001C: * FILE SEPARATOR */
199/* case 0x001D: * GROUP SEPARATOR */
200/* case 0x001E: * RECORD SEPARATOR */
201/* case 0x001F: * UNIT SEPARATOR */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000202 0, 0, 0, 0, 1, 1, 1, 1,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000203/* case 0x0020: * SPACE */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000204 1, 0, 0, 0, 0, 0, 0, 0,
205 0, 0, 0, 0, 0, 0, 0, 0,
206 0, 0, 0, 0, 0, 0, 0, 0,
207 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes190d79e2008-01-30 11:58:22 +0000208
Benjamin Peterson14339b62009-01-31 16:36:08 +0000209 0, 0, 0, 0, 0, 0, 0, 0,
210 0, 0, 0, 0, 0, 0, 0, 0,
211 0, 0, 0, 0, 0, 0, 0, 0,
212 0, 0, 0, 0, 0, 0, 0, 0,
213 0, 0, 0, 0, 0, 0, 0, 0,
214 0, 0, 0, 0, 0, 0, 0, 0,
215 0, 0, 0, 0, 0, 0, 0, 0,
216 0, 0, 0, 0, 0, 0, 0, 0
Christian Heimes190d79e2008-01-30 11:58:22 +0000217};
218
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200219/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200220static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200221static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200222static void copy_characters(
223 PyObject *to, Py_ssize_t to_start,
224 PyObject *from, Py_ssize_t from_start,
225 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100226static int unicode_modifiable(PyObject *unicode);
227
Victor Stinnerfe226c02011-10-03 03:52:20 +0200228
Alexander Belopolsky40018472011-02-26 01:02:56 +0000229static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200230unicode_fromascii(const unsigned char *s, Py_ssize_t size);
231static PyObject *
232_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
235static PyObject *
236_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
237
238static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000240 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100241 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000242 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
243
Alexander Belopolsky40018472011-02-26 01:02:56 +0000244static void
245raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300246 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100247 PyObject *unicode,
248 Py_ssize_t startpos, Py_ssize_t endpos,
249 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000250
Christian Heimes190d79e2008-01-30 11:58:22 +0000251/* Same for linebreaks */
252static unsigned char ascii_linebreak[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000253 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000254/* 0x000A, * LINE FEED */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000255/* 0x000B, * LINE TABULATION */
256/* 0x000C, * FORM FEED */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000257/* 0x000D, * CARRIAGE RETURN */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000258 0, 0, 1, 1, 1, 1, 0, 0,
Benjamin Peterson14339b62009-01-31 16:36:08 +0000259 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000260/* 0x001C, * FILE SEPARATOR */
261/* 0x001D, * GROUP SEPARATOR */
262/* 0x001E, * RECORD SEPARATOR */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000263 0, 0, 0, 0, 1, 1, 1, 0,
264 0, 0, 0, 0, 0, 0, 0, 0,
265 0, 0, 0, 0, 0, 0, 0, 0,
266 0, 0, 0, 0, 0, 0, 0, 0,
267 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes190d79e2008-01-30 11:58:22 +0000268
Benjamin Peterson14339b62009-01-31 16:36:08 +0000269 0, 0, 0, 0, 0, 0, 0, 0,
270 0, 0, 0, 0, 0, 0, 0, 0,
271 0, 0, 0, 0, 0, 0, 0, 0,
272 0, 0, 0, 0, 0, 0, 0, 0,
273 0, 0, 0, 0, 0, 0, 0, 0,
274 0, 0, 0, 0, 0, 0, 0, 0,
275 0, 0, 0, 0, 0, 0, 0, 0,
276 0, 0, 0, 0, 0, 0, 0, 0
Christian Heimes190d79e2008-01-30 11:58:22 +0000277};
278
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300279/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
280 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000282PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000283{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000284#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000286#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000287 /* This is actually an illegal character, so it should
288 not be passed to unichr. */
289 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000290#endif
291}
292
#ifdef Py_DEBUG
/* Debug-build-only validator for the internal layout of a unicode object.

   op must satisfy PyUnicode_Check().  Every invariant is verified with
   assert(), so a violation aborts; when the function returns at all it
   returns 1, which lets callers wrap the call itself in assert().

   If check_content is nonzero (and the string is not in the legacy
   wchar_t-only state), all characters are scanned as well to verify that
   the maximum character actually requires the storage kind in use. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Case 1: compact ASCII -- only the PyASCIIObject header is inspected. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        /* Case 2: compact, non-ASCII -- the character data is taken to start
           right after the PyCompactUnicodeObject header. */
        if (ascii->state.compact == 1) {
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            /* non-ASCII data can never double as the UTF-8 cache */
            assert (compact->utf8 != data);
        }
        else {
            /* Case 3: non-compact -- the data pointer lives in the
               PyUnicodeObject itself. */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not ready yet: only the wstr buffer exists; every other
                   field must still hold its initial value. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* Ready, non-compact string with a separately stored
                   data block. */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII: the UTF-8 cache must alias the data block */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the storage kind has
               the same width as wchar_t on this platform. */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* A missing cache must have a zero cached length. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        /* find the largest character actually stored */
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* maxchar must fall inside the range that requires this kind
           (and, for 1-byte strings, must agree with the ascii flag) */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
    }
    return 1;
}
#endif
404
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100405static PyObject*
406unicode_result_wchar(PyObject *unicode)
407{
408#ifndef Py_DEBUG
409 Py_ssize_t len;
410
411 assert(Py_REFCNT(unicode) == 1);
412
413 len = _PyUnicode_WSTR_LENGTH(unicode);
414 if (len == 0) {
415 Py_INCREF(unicode_empty);
416 Py_DECREF(unicode);
417 return unicode_empty;
418 }
419
420 if (len == 1) {
421 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
422 if (ch < 256) {
423 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
424 Py_DECREF(unicode);
425 return latin1_char;
426 }
427 }
428
429 if (_PyUnicode_Ready(unicode) < 0) {
430 Py_XDECREF(unicode);
431 return NULL;
432 }
433#else
434 /* don't make the result ready in debug mode to ensure that the caller
435 makes the string ready before using it */
436 assert(_PyUnicode_CheckConsistency(unicode, 1));
437#endif
438 return unicode;
439}
440
441static PyObject*
442unicode_result_ready(PyObject *unicode)
443{
444 Py_ssize_t length;
445
446 length = PyUnicode_GET_LENGTH(unicode);
447 if (length == 0) {
448 if (unicode != unicode_empty) {
449 Py_INCREF(unicode_empty);
450 Py_DECREF(unicode);
451 }
452 return unicode_empty;
453 }
454
455 if (length == 1) {
456 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
457 if (ch < 256) {
458 PyObject *latin1_char = unicode_latin1[ch];
459 if (latin1_char != NULL) {
460 if (unicode != latin1_char) {
461 Py_INCREF(latin1_char);
462 Py_DECREF(unicode);
463 }
464 return latin1_char;
465 }
466 else {
467 assert(_PyUnicode_CheckConsistency(unicode, 1));
468 Py_INCREF(unicode);
469 unicode_latin1[ch] = unicode;
470 return unicode;
471 }
472 }
473 }
474
475 assert(_PyUnicode_CheckConsistency(unicode, 1));
476 return unicode;
477}
478
479static PyObject*
480unicode_result(PyObject *unicode)
481{
482 assert(_PyUnicode_CHECK(unicode));
483 if (PyUnicode_IS_READY(unicode))
484 return unicode_result_ready(unicode);
485 else
486 return unicode_result_wchar(unicode);
487}
488
Victor Stinnerc4b49542011-12-11 22:44:26 +0100489static PyObject*
490unicode_result_unchanged(PyObject *unicode)
491{
492 if (PyUnicode_CheckExact(unicode)) {
493 if (PyUnicode_READY(unicode) < 0)
494 return NULL;
495 Py_INCREF(unicode);
496 return unicode;
497 }
498 else
499 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100500 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100501}
502
Victor Stinner3a50e702011-10-18 21:21:00 +0200503#ifdef HAVE_MBCS
504static OSVERSIONINFOEX winver;
505#endif
506
Thomas Wouters477c8d52006-05-27 19:21:47 +0000507/* --- Bloom Filters ----------------------------------------------------- */
508
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (its value modulo BLOOM_WIDTH) as the
   bit index. */
512
513/* the linebreak mask is set up by Unicode_Init below */
514
Antoine Pitrouf068f942010-01-13 14:19:12 +0000515#if LONG_BIT >= 128
516#define BLOOM_WIDTH 128
517#elif LONG_BIT >= 64
518#define BLOOM_WIDTH 64
519#elif LONG_BIT >= 32
520#define BLOOM_WIDTH 32
521#else
522#error "LONG_BIT is smaller than 32"
523#endif
524
Thomas Wouters477c8d52006-05-27 19:21:47 +0000525#define BLOOM_MASK unsigned long
526
527static BLOOM_MASK bloom_linebreak;
528
/* BLOOM_ADD sets, and BLOOM tests, the bit of mask selected by the low-order
   bits of ch (ch modulo BLOOM_WIDTH).  mask must be a modifiable lvalue for
   BLOOM_ADD.  Both arguments are parenthesized so that expression arguments
   are not re-associated by operator precedence. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000531
Benjamin Peterson29060642009-01-31 22:14:21 +0000532#define BLOOM_LINEBREAK(ch) \
533 ((ch) < 128U ? ascii_linebreak[(ch)] : \
534 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000535
Alexander Belopolsky40018472011-02-26 01:02:56 +0000536Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200537make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000538{
539 /* calculate simple bloom-style bitmask for a given unicode string */
540
Antoine Pitrouf068f942010-01-13 14:19:12 +0000541 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542 Py_ssize_t i;
543
544 mask = 0;
545 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200546 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
548 return mask;
549}
550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200551#define BLOOM_MEMBER(mask, chr, str) \
552 (BLOOM(mask, chr) \
553 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200555/* Compilation of templated routines */
556
557#include "stringlib/asciilib.h"
558#include "stringlib/fastsearch.h"
559#include "stringlib/partition.h"
560#include "stringlib/split.h"
561#include "stringlib/count.h"
562#include "stringlib/find.h"
563#include "stringlib/find_max_char.h"
564#include "stringlib/localeutil.h"
565#include "stringlib/undef.h"
566
567#include "stringlib/ucs1lib.h"
568#include "stringlib/fastsearch.h"
569#include "stringlib/partition.h"
570#include "stringlib/split.h"
571#include "stringlib/count.h"
572#include "stringlib/find.h"
573#include "stringlib/find_max_char.h"
574#include "stringlib/localeutil.h"
575#include "stringlib/undef.h"
576
577#include "stringlib/ucs2lib.h"
578#include "stringlib/fastsearch.h"
579#include "stringlib/partition.h"
580#include "stringlib/split.h"
581#include "stringlib/count.h"
582#include "stringlib/find.h"
583#include "stringlib/find_max_char.h"
584#include "stringlib/localeutil.h"
585#include "stringlib/undef.h"
586
587#include "stringlib/ucs4lib.h"
588#include "stringlib/fastsearch.h"
589#include "stringlib/partition.h"
590#include "stringlib/split.h"
591#include "stringlib/count.h"
592#include "stringlib/find.h"
593#include "stringlib/find_max_char.h"
594#include "stringlib/localeutil.h"
595#include "stringlib/undef.h"
596
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200597#include "stringlib/unicodedefs.h"
598#include "stringlib/fastsearch.h"
599#include "stringlib/count.h"
600#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100601#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200602
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603/* --- Unicode Object ----------------------------------------------------- */
604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200605static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200606fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200607
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200608Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
609 Py_ssize_t size, Py_UCS4 ch,
610 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
613
614 switch (kind) {
615 case PyUnicode_1BYTE_KIND:
616 {
617 Py_UCS1 ch1 = (Py_UCS1) ch;
618 if (ch1 == ch)
619 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
620 else
621 return -1;
622 }
623 case PyUnicode_2BYTE_KIND:
624 {
625 Py_UCS2 ch2 = (Py_UCS2) ch;
626 if (ch2 == ch)
627 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
628 else
629 return -1;
630 }
631 case PyUnicode_4BYTE_KIND:
632 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
633 default:
634 assert(0);
635 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200637}
638
Victor Stinnerfe226c02011-10-03 03:52:20 +0200639static PyObject*
640resize_compact(PyObject *unicode, Py_ssize_t length)
641{
642 Py_ssize_t char_size;
643 Py_ssize_t struct_size;
644 Py_ssize_t new_size;
645 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100646 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200647 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100648 assert(PyUnicode_IS_COMPACT(unicode));
649
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200650 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100651 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200652 struct_size = sizeof(PyASCIIObject);
653 else
654 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200655 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200656
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
658 PyErr_NoMemory();
659 return NULL;
660 }
661 new_size = (struct_size + (length + 1) * char_size);
662
Victor Stinner84def372011-12-11 20:04:56 +0100663 _Py_DEC_REFTOTAL;
664 _Py_ForgetReference(unicode);
665
666 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
667 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100668 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200669 PyErr_NoMemory();
670 return NULL;
671 }
Victor Stinner84def372011-12-11 20:04:56 +0100672 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200673 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100674
Victor Stinnerfe226c02011-10-03 03:52:20 +0200675 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200676 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200677 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100678 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200679 _PyUnicode_WSTR_LENGTH(unicode) = length;
680 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
682 length, 0);
683 return unicode;
684}
685
Alexander Belopolsky40018472011-02-26 01:02:56 +0000686static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200687resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000688{
Victor Stinner95663112011-10-04 01:03:50 +0200689 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100690 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200691 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200692 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000693
Victor Stinnerfe226c02011-10-03 03:52:20 +0200694 if (PyUnicode_IS_READY(unicode)) {
695 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200696 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 void *data;
698
699 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200700 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200701 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
702 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200703
704 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
705 PyErr_NoMemory();
706 return -1;
707 }
708 new_size = (length + 1) * char_size;
709
Victor Stinner7a9105a2011-12-12 00:13:42 +0100710 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
711 {
712 PyObject_DEL(_PyUnicode_UTF8(unicode));
713 _PyUnicode_UTF8(unicode) = NULL;
714 _PyUnicode_UTF8_LENGTH(unicode) = 0;
715 }
716
Victor Stinnerfe226c02011-10-03 03:52:20 +0200717 data = (PyObject *)PyObject_REALLOC(data, new_size);
718 if (data == NULL) {
719 PyErr_NoMemory();
720 return -1;
721 }
722 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200723 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200724 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200725 _PyUnicode_WSTR_LENGTH(unicode) = length;
726 }
727 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200728 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200729 _PyUnicode_UTF8_LENGTH(unicode) = length;
730 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200731 _PyUnicode_LENGTH(unicode) = length;
732 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200733 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200734 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200735 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 }
Victor Stinner95663112011-10-04 01:03:50 +0200738 assert(_PyUnicode_WSTR(unicode) != NULL);
739
740 /* check for integer overflow */
741 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
742 PyErr_NoMemory();
743 return -1;
744 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100745 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200746 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100747 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200748 if (!wstr) {
749 PyErr_NoMemory();
750 return -1;
751 }
752 _PyUnicode_WSTR(unicode) = wstr;
753 _PyUnicode_WSTR(unicode)[length] = 0;
754 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200755 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000756 return 0;
757}
758
Victor Stinnerfe226c02011-10-03 03:52:20 +0200759static PyObject*
760resize_copy(PyObject *unicode, Py_ssize_t length)
761{
762 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100763 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100765
766 if (PyUnicode_READY(unicode) < 0)
767 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200768
769 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
770 if (copy == NULL)
771 return NULL;
772
773 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200774 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200775 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200776 }
777 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200778 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100779
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200780 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200781 if (w == NULL)
782 return NULL;
783 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
784 copy_length = Py_MIN(copy_length, length);
785 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
786 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200787 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200788 }
789}
790
/* We allocate one more character than requested to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/

799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200800#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200801static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200802#endif
803
Alexander Belopolsky40018472011-02-26 01:02:56 +0000804static PyUnicodeObject *
805_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000806{
807 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200808 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000809
Thomas Wouters477c8d52006-05-27 19:21:47 +0000810 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811 if (length == 0 && unicode_empty != NULL) {
812 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200813 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000814 }
815
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000816 /* Ensure we won't overflow the size. */
817 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
818 return (PyUnicodeObject *)PyErr_NoMemory();
819 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200820 if (length < 0) {
821 PyErr_SetString(PyExc_SystemError,
822 "Negative size passed to _PyUnicode_New");
823 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824 }
825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826#ifdef Py_DEBUG
827 ++unicode_old_new_calls;
828#endif
829
830 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
831 if (unicode == NULL)
832 return NULL;
833 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
834 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
835 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100836 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000837 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100838 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200840
Jeremy Hyltond8082792003-09-16 19:41:39 +0000841 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000842 * the caller fails before initializing str -- unicode_resize()
843 * reads str[0], and the Keep-Alive optimization can keep memory
844 * allocated for str alive across a call to unicode_dealloc(unicode).
845 * We don't want unicode_resize to read uninitialized memory in
846 * that case.
847 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200848 _PyUnicode_WSTR(unicode)[0] = 0;
849 _PyUnicode_WSTR(unicode)[length] = 0;
850 _PyUnicode_WSTR_LENGTH(unicode) = length;
851 _PyUnicode_HASH(unicode) = -1;
852 _PyUnicode_STATE(unicode).interned = 0;
853 _PyUnicode_STATE(unicode).kind = 0;
854 _PyUnicode_STATE(unicode).compact = 0;
855 _PyUnicode_STATE(unicode).ready = 0;
856 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200857 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200859 _PyUnicode_UTF8(unicode) = NULL;
860 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100861 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000862 return unicode;
863}
864
Victor Stinnerf42dc442011-10-02 23:33:16 +0200865static const char*
866unicode_kind_name(PyObject *unicode)
867{
Victor Stinner42dfd712011-10-03 14:41:45 +0200868 /* don't check consistency: unicode_kind_name() is called from
869 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200870 if (!PyUnicode_IS_COMPACT(unicode))
871 {
872 if (!PyUnicode_IS_READY(unicode))
873 return "wstr";
874 switch(PyUnicode_KIND(unicode))
875 {
876 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200877 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200878 return "legacy ascii";
879 else
880 return "legacy latin1";
881 case PyUnicode_2BYTE_KIND:
882 return "legacy UCS2";
883 case PyUnicode_4BYTE_KIND:
884 return "legacy UCS4";
885 default:
886 return "<legacy invalid kind>";
887 }
888 }
889 assert(PyUnicode_IS_READY(unicode));
890 switch(PyUnicode_KIND(unicode))
891 {
892 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 return "ascii";
895 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200896 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200897 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200898 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200899 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200900 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 default:
902 return "<invalid compact kind>";
903 }
904}
905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200906#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200907static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200908
909/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
913
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
917void *_PyUnicode_data(void *unicode){
918 printf("obj %p\n", unicode);
919 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
920 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
921 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
922 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
923 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
924 return PyUnicode_DATA(unicode);
925}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200926
927void
928_PyUnicode_Dump(PyObject *op)
929{
930 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200931 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
932 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
933 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200934
Victor Stinnera849a4b2011-10-03 12:12:11 +0200935 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200936 {
937 if (ascii->state.ascii)
938 data = (ascii + 1);
939 else
940 data = (compact + 1);
941 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200942 else
943 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200944 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
945
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 if (ascii->wstr == data)
947 printf("shared ");
948 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200949
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200951 printf(" (%zu), ", compact->wstr_length);
952 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
953 printf("shared ");
954 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200955 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200956 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200957}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200958#endif
959
960PyObject *
961PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
962{
963 PyObject *obj;
964 PyCompactUnicodeObject *unicode;
965 void *data;
966 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200967 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 Py_ssize_t char_size;
969 Py_ssize_t struct_size;
970
971 /* Optimization for empty strings */
972 if (size == 0 && unicode_empty != NULL) {
973 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200974 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 }
976
977#ifdef Py_DEBUG
978 ++unicode_new_new_calls;
979#endif
980
Victor Stinner9e9d6892011-10-04 01:02:02 +0200981 is_ascii = 0;
982 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 struct_size = sizeof(PyCompactUnicodeObject);
984 if (maxchar < 128) {
985 kind_state = PyUnicode_1BYTE_KIND;
986 char_size = 1;
987 is_ascii = 1;
988 struct_size = sizeof(PyASCIIObject);
989 }
990 else if (maxchar < 256) {
991 kind_state = PyUnicode_1BYTE_KIND;
992 char_size = 1;
993 }
994 else if (maxchar < 65536) {
995 kind_state = PyUnicode_2BYTE_KIND;
996 char_size = 2;
997 if (sizeof(wchar_t) == 2)
998 is_sharing = 1;
999 }
1000 else {
1001 kind_state = PyUnicode_4BYTE_KIND;
1002 char_size = 4;
1003 if (sizeof(wchar_t) == 4)
1004 is_sharing = 1;
1005 }
1006
1007 /* Ensure we won't overflow the size. */
1008 if (size < 0) {
1009 PyErr_SetString(PyExc_SystemError,
1010 "Negative size passed to PyUnicode_New");
1011 return NULL;
1012 }
1013 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1014 return PyErr_NoMemory();
1015
1016 /* Duplicated allocation code from _PyObject_New() instead of a call to
1017 * PyObject_New() so we are able to allocate space for the object and
1018 * it's data buffer.
1019 */
1020 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1021 if (obj == NULL)
1022 return PyErr_NoMemory();
1023 obj = PyObject_INIT(obj, &PyUnicode_Type);
1024 if (obj == NULL)
1025 return NULL;
1026
1027 unicode = (PyCompactUnicodeObject *)obj;
1028 if (is_ascii)
1029 data = ((PyASCIIObject*)obj) + 1;
1030 else
1031 data = unicode + 1;
1032 _PyUnicode_LENGTH(unicode) = size;
1033 _PyUnicode_HASH(unicode) = -1;
1034 _PyUnicode_STATE(unicode).interned = 0;
1035 _PyUnicode_STATE(unicode).kind = kind_state;
1036 _PyUnicode_STATE(unicode).compact = 1;
1037 _PyUnicode_STATE(unicode).ready = 1;
1038 _PyUnicode_STATE(unicode).ascii = is_ascii;
1039 if (is_ascii) {
1040 ((char*)data)[size] = 0;
1041 _PyUnicode_WSTR(unicode) = NULL;
1042 }
1043 else if (kind_state == PyUnicode_1BYTE_KIND) {
1044 ((char*)data)[size] = 0;
1045 _PyUnicode_WSTR(unicode) = NULL;
1046 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001048 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 }
1050 else {
1051 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001052 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 if (kind_state == PyUnicode_2BYTE_KIND)
1054 ((Py_UCS2*)data)[size] = 0;
1055 else /* kind_state == PyUnicode_4BYTE_KIND */
1056 ((Py_UCS4*)data)[size] = 0;
1057 if (is_sharing) {
1058 _PyUnicode_WSTR_LENGTH(unicode) = size;
1059 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1060 }
1061 else {
1062 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1063 _PyUnicode_WSTR(unicode) = NULL;
1064 }
1065 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001066 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 return obj;
1068}
1069
1070#if SIZEOF_WCHAR_T == 2
1071/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1072 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001073 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001074
1075 This function assumes that unicode can hold one more code point than wstr
1076 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001077static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001078unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001079 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001080{
1081 const wchar_t *iter;
1082 Py_UCS4 *ucs4_out;
1083
Victor Stinner910337b2011-10-03 03:20:16 +02001084 assert(unicode != NULL);
1085 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001086 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1087 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1088
1089 for (iter = begin; iter < end; ) {
1090 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1091 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001092 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1093 && (iter+1) < end
1094 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001095 {
Victor Stinner551ac952011-11-29 22:58:13 +01001096 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 iter += 2;
1098 }
1099 else {
1100 *ucs4_out++ = *iter;
1101 iter++;
1102 }
1103 }
1104 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1105 _PyUnicode_GET_LENGTH(unicode)));
1106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107}
1108#endif
1109
Victor Stinnercd9950f2011-10-02 00:34:53 +02001110static int
Victor Stinner488fa492011-12-12 00:01:39 +01001111unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001112{
Victor Stinner488fa492011-12-12 00:01:39 +01001113 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001114 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001115 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001116 return -1;
1117 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001118 return 0;
1119}
1120
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001121static int
1122_copy_characters(PyObject *to, Py_ssize_t to_start,
1123 PyObject *from, Py_ssize_t from_start,
1124 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001126 unsigned int from_kind, to_kind;
1127 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001128 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130 assert(PyUnicode_Check(from));
1131 assert(PyUnicode_Check(to));
1132 assert(PyUnicode_IS_READY(from));
1133 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1136 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1137 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001139 if (how_many == 0)
1140 return 0;
1141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001143 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001145 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147#ifdef Py_DEBUG
1148 if (!check_maxchar
1149 && (from_kind > to_kind
1150 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1153 Py_UCS4 ch;
1154 Py_ssize_t i;
1155 for (i=0; i < how_many; i++) {
1156 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1157 assert(ch <= to_maxchar);
1158 }
1159 }
1160#endif
1161 fast = (from_kind == to_kind);
1162 if (check_maxchar
1163 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1164 {
1165 /* deny latin1 => ascii */
1166 fast = 0;
1167 }
1168
1169 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001170 Py_MEMCPY((char*)to_data + to_kind * to_start,
1171 (char*)from_data + from_kind * from_start,
1172 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001174 else if (from_kind == PyUnicode_1BYTE_KIND
1175 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001176 {
1177 _PyUnicode_CONVERT_BYTES(
1178 Py_UCS1, Py_UCS2,
1179 PyUnicode_1BYTE_DATA(from) + from_start,
1180 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1181 PyUnicode_2BYTE_DATA(to) + to_start
1182 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001183 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001184 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001185 && to_kind == PyUnicode_4BYTE_KIND)
1186 {
1187 _PyUnicode_CONVERT_BYTES(
1188 Py_UCS1, Py_UCS4,
1189 PyUnicode_1BYTE_DATA(from) + from_start,
1190 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1191 PyUnicode_4BYTE_DATA(to) + to_start
1192 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001193 }
1194 else if (from_kind == PyUnicode_2BYTE_KIND
1195 && to_kind == PyUnicode_4BYTE_KIND)
1196 {
1197 _PyUnicode_CONVERT_BYTES(
1198 Py_UCS2, Py_UCS4,
1199 PyUnicode_2BYTE_DATA(from) + from_start,
1200 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1201 PyUnicode_4BYTE_DATA(to) + to_start
1202 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001203 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001204 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001205 /* check if max_char(from substring) <= max_char(to) */
1206 if (from_kind > to_kind
1207 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001208 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001209 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001210 /* slow path to check for character overflow */
1211 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001212 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 Py_ssize_t i;
1214
Victor Stinner56c161a2011-10-06 02:47:11 +02001215#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001216 for (i=0; i < how_many; i++) {
1217 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001218 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001219 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1220 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001221#else
1222 if (!check_maxchar) {
1223 for (i=0; i < how_many; i++) {
1224 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1225 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1226 }
1227 }
1228 else {
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 if (ch > to_maxchar)
1232 return 1;
1233 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1234 }
1235 }
1236#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001237 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001238 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001239 assert(0 && "inconsistent state");
1240 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001241 }
1242 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001243 return 0;
1244}
1245
1246static void
1247copy_characters(PyObject *to, Py_ssize_t to_start,
1248 PyObject *from, Py_ssize_t from_start,
1249 Py_ssize_t how_many)
1250{
1251 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1252}
1253
1254Py_ssize_t
1255PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1256 PyObject *from, Py_ssize_t from_start,
1257 Py_ssize_t how_many)
1258{
1259 int err;
1260
1261 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1262 PyErr_BadInternalCall();
1263 return -1;
1264 }
1265
1266 if (PyUnicode_READY(from))
1267 return -1;
1268 if (PyUnicode_READY(to))
1269 return -1;
1270
1271 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1272 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1273 PyErr_Format(PyExc_SystemError,
1274 "Cannot write %zi characters at %zi "
1275 "in a string of %zi characters",
1276 how_many, to_start, PyUnicode_GET_LENGTH(to));
1277 return -1;
1278 }
1279
1280 if (how_many == 0)
1281 return 0;
1282
Victor Stinner488fa492011-12-12 00:01:39 +01001283 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001284 return -1;
1285
1286 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1287 if (err) {
1288 PyErr_Format(PyExc_SystemError,
1289 "Cannot copy %s characters "
1290 "into a string of %s characters",
1291 unicode_kind_name(from),
1292 unicode_kind_name(to));
1293 return -1;
1294 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001295 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296}
1297
Victor Stinner17222162011-09-28 22:15:37 +02001298/* Find the maximum code point and count the number of surrogate pairs so a
1299 correct string length can be computed before converting a string to UCS4.
1300 This function counts single surrogates as a character and not as a pair.
1301
1302 Return 0 on success, or -1 on error. */
1303static int
1304find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1305 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306{
1307 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001308 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001309
Victor Stinnerc53be962011-10-02 21:33:54 +02001310 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 *num_surrogates = 0;
1312 *maxchar = 0;
1313
1314 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001316 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1317 && (iter+1) < end
1318 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001320 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322 iter += 2;
1323 }
1324 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001326 {
1327 ch = *iter;
1328 iter++;
1329 }
1330 if (ch > *maxchar) {
1331 *maxchar = ch;
1332 if (*maxchar > MAX_UNICODE) {
1333 PyErr_Format(PyExc_ValueError,
1334 "character U+%x is not in range [U+0000; U+10ffff]",
1335 ch);
1336 return -1;
1337 }
1338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001339 }
1340 return 0;
1341}
1342
1343#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001344static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345#endif
1346
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001347int
1348_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001349{
1350 wchar_t *end;
1351 Py_UCS4 maxchar = 0;
1352 Py_ssize_t num_surrogates;
1353#if SIZEOF_WCHAR_T == 2
1354 Py_ssize_t length_wo_surrogates;
1355#endif
1356
Georg Brandl7597add2011-10-05 16:36:47 +02001357 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001358 strings were created using _PyObject_New() and where no canonical
1359 representation (the str field) has been set yet aka strings
1360 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001361 assert(_PyUnicode_CHECK(unicode));
1362 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001364 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001365 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001366 /* Actually, it should neither be interned nor be anything else: */
1367 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368
1369#ifdef Py_DEBUG
1370 ++unicode_ready_calls;
1371#endif
1372
1373 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001374 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001375 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377
1378 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001379 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1380 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 PyErr_NoMemory();
1382 return -1;
1383 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001384 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 _PyUnicode_WSTR(unicode), end,
1386 PyUnicode_1BYTE_DATA(unicode));
1387 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1388 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1389 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1390 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001391 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001392 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001393 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 }
1395 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001396 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001397 _PyUnicode_UTF8(unicode) = NULL;
1398 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 }
1400 PyObject_FREE(_PyUnicode_WSTR(unicode));
1401 _PyUnicode_WSTR(unicode) = NULL;
1402 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1403 }
1404 /* In this case we might have to convert down from 4-byte native
1405 wchar_t to 2-byte unicode. */
1406 else if (maxchar < 65536) {
1407 assert(num_surrogates == 0 &&
1408 "FindMaxCharAndNumSurrogatePairs() messed up");
1409
Victor Stinner506f5922011-09-28 22:34:18 +02001410#if SIZEOF_WCHAR_T == 2
1411 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001412 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001413 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1414 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1415 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001416 _PyUnicode_UTF8(unicode) = NULL;
1417 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001418#else
1419 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001420 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001421 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001422 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001423 PyErr_NoMemory();
1424 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 }
Victor Stinner506f5922011-09-28 22:34:18 +02001426 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1427 _PyUnicode_WSTR(unicode), end,
1428 PyUnicode_2BYTE_DATA(unicode));
1429 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1430 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1431 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001432 _PyUnicode_UTF8(unicode) = NULL;
1433 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001434 PyObject_FREE(_PyUnicode_WSTR(unicode));
1435 _PyUnicode_WSTR(unicode) = NULL;
1436 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1437#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
1439 /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1440 else {
1441#if SIZEOF_WCHAR_T == 2
1442 /* in case the native representation is 2-bytes, we need to allocate a
1443 new normalized 4-byte version. */
1444 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001445 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1446 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 PyErr_NoMemory();
1448 return -1;
1449 }
1450 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1451 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001452 _PyUnicode_UTF8(unicode) = NULL;
1453 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001454 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1455 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001456 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 PyObject_FREE(_PyUnicode_WSTR(unicode));
1458 _PyUnicode_WSTR(unicode) = NULL;
1459 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1460#else
1461 assert(num_surrogates == 0);
1462
Victor Stinnerc3c74152011-10-02 20:39:55 +02001463 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001465 _PyUnicode_UTF8(unicode) = NULL;
1466 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1468#endif
1469 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1470 }
1471 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001472 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 return 0;
1474}
1475
Alexander Belopolsky40018472011-02-26 01:02:56 +00001476static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001477unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001478{
Walter Dörwald16807132007-05-25 13:52:07 +00001479 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001480 case SSTATE_NOT_INTERNED:
1481 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001482
Benjamin Peterson29060642009-01-31 22:14:21 +00001483 case SSTATE_INTERNED_MORTAL:
1484 /* revive dead object temporarily for DelItem */
1485 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001486 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001487 Py_FatalError(
1488 "deletion of interned string failed");
1489 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001490
Benjamin Peterson29060642009-01-31 22:14:21 +00001491 case SSTATE_INTERNED_IMMORTAL:
1492 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001493
Benjamin Peterson29060642009-01-31 22:14:21 +00001494 default:
1495 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001496 }
1497
Victor Stinner03490912011-10-03 23:45:12 +02001498 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001500 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001501 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001502 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1503 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001505 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506}
1507
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character objects in unicode_latin1[], 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    header = (PyASCIIObject *)unicode;
    /* Only a non-wchar-kind string of length 1 can be a cached character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    return (first < 256 && unicode_latin1[first] == unicode) ? 1 : 0;
}
#endif
1524
Alexander Belopolsky40018472011-02-26 01:02:56 +00001525static int
Victor Stinner488fa492011-12-12 00:01:39 +01001526unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001527{
Victor Stinner488fa492011-12-12 00:01:39 +01001528 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001529 if (Py_REFCNT(unicode) != 1)
1530 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001531 if (_PyUnicode_HASH(unicode) != -1)
1532 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001533 if (PyUnicode_CHECK_INTERNED(unicode))
1534 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001535 if (!PyUnicode_CheckExact(unicode))
1536 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001537#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001538 /* singleton refcount is greater than 1 */
1539 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001540#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001541 return 1;
1542}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001543
Victor Stinnerfe226c02011-10-03 03:52:20 +02001544static int
1545unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1546{
1547 PyObject *unicode;
1548 Py_ssize_t old_length;
1549
1550 assert(p_unicode != NULL);
1551 unicode = *p_unicode;
1552
1553 assert(unicode != NULL);
1554 assert(PyUnicode_Check(unicode));
1555 assert(0 <= length);
1556
Victor Stinner910337b2011-10-03 03:20:16 +02001557 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001558 old_length = PyUnicode_WSTR_LENGTH(unicode);
1559 else
1560 old_length = PyUnicode_GET_LENGTH(unicode);
1561 if (old_length == length)
1562 return 0;
1563
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001564 if (length == 0) {
1565 Py_DECREF(*p_unicode);
1566 *p_unicode = unicode_empty;
1567 Py_INCREF(*p_unicode);
1568 return 0;
1569 }
1570
Victor Stinner488fa492011-12-12 00:01:39 +01001571 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001572 PyObject *copy = resize_copy(unicode, length);
1573 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001575 Py_DECREF(*p_unicode);
1576 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001577 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001578 }
1579
Victor Stinnerfe226c02011-10-03 03:52:20 +02001580 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001581 PyObject *new_unicode = resize_compact(unicode, length);
1582 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001583 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001584 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001585 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001586 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001587 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001588 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001589}
1590
Alexander Belopolsky40018472011-02-26 01:02:56 +00001591int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001593{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001594 PyObject *unicode;
1595 if (p_unicode == NULL) {
1596 PyErr_BadInternalCall();
1597 return -1;
1598 }
1599 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001600 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001601 {
1602 PyErr_BadInternalCall();
1603 return -1;
1604 }
1605 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001606}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001607
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001608static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001609unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001610{
1611 PyObject *result;
1612 assert(PyUnicode_IS_READY(*p_unicode));
1613 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1614 return 0;
1615 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1616 maxchar);
1617 if (result == NULL)
1618 return -1;
1619 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1620 PyUnicode_GET_LENGTH(*p_unicode));
1621 Py_DECREF(*p_unicode);
1622 *p_unicode = result;
1623 return 0;
1624}
1625
1626static int
1627unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1628 Py_UCS4 ch)
1629{
1630 if (unicode_widen(p_unicode, ch) < 0)
1631 return -1;
1632 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1633 PyUnicode_DATA(*p_unicode),
1634 (*pos)++, ch);
1635 return 0;
1636}
1637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638static PyObject*
1639get_latin1_char(unsigned char ch)
1640{
Victor Stinnera464fc12011-10-02 20:39:30 +02001641 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001643 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001644 if (!unicode)
1645 return NULL;
1646 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001647 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648 unicode_latin1[ch] = unicode;
1649 }
1650 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001651 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652}
1653
Alexander Belopolsky40018472011-02-26 01:02:56 +00001654PyObject *
1655PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001656{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001657 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 Py_UCS4 maxchar = 0;
1659 Py_ssize_t num_surrogates;
1660
1661 if (u == NULL)
1662 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001663
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001664 /* If the Unicode data is known at construction time, we can apply
1665 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 /* Optimization for empty strings */
1668 if (size == 0 && unicode_empty != NULL) {
1669 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001670 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001671 }
Tim Petersced69f82003-09-16 20:30:58 +00001672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001673 /* Single character Unicode objects in the Latin-1 range are
1674 shared when using this constructor */
1675 if (size == 1 && *u < 256)
1676 return get_latin1_char((unsigned char)*u);
1677
1678 /* If not empty and not single character, copy the Unicode data
1679 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001680 if (find_maxchar_surrogates(u, u + size,
1681 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682 return NULL;
1683
Victor Stinner8faf8212011-12-08 22:14:11 +01001684 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685 if (!unicode)
1686 return NULL;
1687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001688 switch (PyUnicode_KIND(unicode)) {
1689 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001690 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1692 break;
1693 case PyUnicode_2BYTE_KIND:
1694#if Py_UNICODE_SIZE == 2
1695 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1696#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001697 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001698 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1699#endif
1700 break;
1701 case PyUnicode_4BYTE_KIND:
1702#if SIZEOF_WCHAR_T == 2
1703 /* This is the only case which has to process surrogates, thus
1704 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001705 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706#else
1707 assert(num_surrogates == 0);
1708 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1709#endif
1710 break;
1711 default:
1712 assert(0 && "Impossible state");
1713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001715 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716}
1717
Alexander Belopolsky40018472011-02-26 01:02:56 +00001718PyObject *
1719PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001720{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001721 if (size < 0) {
1722 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001723 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001724 return NULL;
1725 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001726 if (u != NULL)
1727 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1728 else
1729 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001730}
1731
Alexander Belopolsky40018472011-02-26 01:02:56 +00001732PyObject *
1733PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001734{
1735 size_t size = strlen(u);
1736 if (size > PY_SSIZE_T_MAX) {
1737 PyErr_SetString(PyExc_OverflowError, "input too long");
1738 return NULL;
1739 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001740 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001741}
1742
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001743PyObject *
1744_PyUnicode_FromId(_Py_Identifier *id)
1745{
1746 if (!id->object) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001747 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1748 strlen(id->string),
1749 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001750 if (!id->object)
1751 return NULL;
1752 PyUnicode_InternInPlace(&id->object);
1753 assert(!id->next);
1754 id->next = static_strings;
1755 static_strings = id;
1756 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001757 return id->object;
1758}
1759
1760void
1761_PyUnicode_ClearStaticStrings()
1762{
1763 _Py_Identifier *i;
1764 for (i = static_strings; i; i = i->next) {
1765 Py_DECREF(i->object);
1766 i->object = NULL;
1767 i->next = NULL;
1768 }
1769}
1770
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001771/* Internal function, don't check maximum character */
1772
Victor Stinnere57b1c02011-09-28 22:20:48 +02001773static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001774unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001775{
Victor Stinner785938e2011-12-11 20:09:03 +01001776 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001777 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001778#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001779 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001780#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001781 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001782 }
Victor Stinner785938e2011-12-11 20:09:03 +01001783 unicode = PyUnicode_New(size, 127);
1784 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001785 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001786 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1787 assert(_PyUnicode_CheckConsistency(unicode, 1));
1788 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001789}
1790
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001791static Py_UCS4
1792kind_maxchar_limit(unsigned int kind)
1793{
1794 switch(kind) {
1795 case PyUnicode_1BYTE_KIND:
1796 return 0x80;
1797 case PyUnicode_2BYTE_KIND:
1798 return 0x100;
1799 case PyUnicode_4BYTE_KIND:
1800 return 0x10000;
1801 default:
1802 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001803 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001804 }
1805}
1806
Victor Stinner702c7342011-10-05 13:50:52 +02001807static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001808_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001809{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001810 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001811 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001812
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001813 if (size == 0) {
1814 Py_INCREF(unicode_empty);
1815 return unicode_empty;
1816 }
1817 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001818 if (size == 1)
1819 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001820
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001821 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001822 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 if (!res)
1824 return NULL;
1825 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001826 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001828}
1829
Victor Stinnere57b1c02011-09-28 22:20:48 +02001830static PyObject*
1831_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832{
1833 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001834 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001835
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001836 if (size == 0) {
1837 Py_INCREF(unicode_empty);
1838 return unicode_empty;
1839 }
1840 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001841 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001842 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001843
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001844 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001845 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846 if (!res)
1847 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001848 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001849 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001850 else {
1851 _PyUnicode_CONVERT_BYTES(
1852 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1853 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001854 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 return res;
1856}
1857
Victor Stinnere57b1c02011-09-28 22:20:48 +02001858static PyObject*
1859_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001860{
1861 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001862 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001863
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001864 if (size == 0) {
1865 Py_INCREF(unicode_empty);
1866 return unicode_empty;
1867 }
1868 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001869 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001870 return get_latin1_char((unsigned char)u[0]);
1871
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001872 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001873 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 if (!res)
1875 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001876 if (max_char < 256)
1877 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1878 PyUnicode_1BYTE_DATA(res));
1879 else if (max_char < 0x10000)
1880 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1881 PyUnicode_2BYTE_DATA(res));
1882 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001884 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 return res;
1886}
1887
1888PyObject*
1889PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1890{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001891 if (size < 0) {
1892 PyErr_SetString(PyExc_ValueError, "size must be positive");
1893 return NULL;
1894 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001895 switch(kind) {
1896 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001897 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001898 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001899 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001900 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001901 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001902 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001903 PyErr_SetString(PyExc_SystemError, "invalid kind");
1904 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001906}
1907
Victor Stinner25a4b292011-10-06 12:31:55 +02001908/* Ensure that a string uses the most efficient storage, if it is not the
1909 case: create a new string with of the right kind. Write NULL into *p_unicode
1910 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001911static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001912unicode_adjust_maxchar(PyObject **p_unicode)
1913{
1914 PyObject *unicode, *copy;
1915 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001916 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001917 unsigned int kind;
1918
1919 assert(p_unicode != NULL);
1920 unicode = *p_unicode;
1921 assert(PyUnicode_IS_READY(unicode));
1922 if (PyUnicode_IS_ASCII(unicode))
1923 return;
1924
1925 len = PyUnicode_GET_LENGTH(unicode);
1926 kind = PyUnicode_KIND(unicode);
1927 if (kind == PyUnicode_1BYTE_KIND) {
1928 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001929 max_char = ucs1lib_find_max_char(u, u + len);
1930 if (max_char >= 128)
1931 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001932 }
1933 else if (kind == PyUnicode_2BYTE_KIND) {
1934 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001935 max_char = ucs2lib_find_max_char(u, u + len);
1936 if (max_char >= 256)
1937 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001938 }
1939 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001940 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001941 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001942 max_char = ucs4lib_find_max_char(u, u + len);
1943 if (max_char >= 0x10000)
1944 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001945 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001946 copy = PyUnicode_New(len, max_char);
1947 copy_characters(copy, 0, unicode, 0, len);
1948 Py_DECREF(unicode);
1949 *p_unicode = copy;
1950}
1951
Victor Stinner034f6cf2011-09-30 02:26:44 +02001952PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01001953_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02001954{
Victor Stinner87af4f22011-11-21 23:03:47 +01001955 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001956 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001957
Victor Stinner034f6cf2011-09-30 02:26:44 +02001958 if (!PyUnicode_Check(unicode)) {
1959 PyErr_BadInternalCall();
1960 return NULL;
1961 }
1962 if (PyUnicode_READY(unicode))
1963 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001964
Victor Stinner87af4f22011-11-21 23:03:47 +01001965 length = PyUnicode_GET_LENGTH(unicode);
1966 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001967 if (!copy)
1968 return NULL;
1969 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1970
Victor Stinner87af4f22011-11-21 23:03:47 +01001971 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1972 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001973 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001974 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001975}
1976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977
Victor Stinnerbc603d12011-10-02 01:00:40 +02001978/* Widen Unicode objects to larger buffers. Don't write terminating null
1979 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001980
1981void*
1982_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1983{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001984 Py_ssize_t len;
1985 void *result;
1986 unsigned int skind;
1987
1988 if (PyUnicode_READY(s))
1989 return NULL;
1990
1991 len = PyUnicode_GET_LENGTH(s);
1992 skind = PyUnicode_KIND(s);
1993 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001994 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995 return NULL;
1996 }
1997 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001998 case PyUnicode_2BYTE_KIND:
1999 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2000 if (!result)
2001 return PyErr_NoMemory();
2002 assert(skind == PyUnicode_1BYTE_KIND);
2003 _PyUnicode_CONVERT_BYTES(
2004 Py_UCS1, Py_UCS2,
2005 PyUnicode_1BYTE_DATA(s),
2006 PyUnicode_1BYTE_DATA(s) + len,
2007 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002009 case PyUnicode_4BYTE_KIND:
2010 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2011 if (!result)
2012 return PyErr_NoMemory();
2013 if (skind == PyUnicode_2BYTE_KIND) {
2014 _PyUnicode_CONVERT_BYTES(
2015 Py_UCS2, Py_UCS4,
2016 PyUnicode_2BYTE_DATA(s),
2017 PyUnicode_2BYTE_DATA(s) + len,
2018 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002020 else {
2021 assert(skind == PyUnicode_1BYTE_KIND);
2022 _PyUnicode_CONVERT_BYTES(
2023 Py_UCS1, Py_UCS4,
2024 PyUnicode_1BYTE_DATA(s),
2025 PyUnicode_1BYTE_DATA(s) + len,
2026 result);
2027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002029 default:
2030 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002031 }
Victor Stinner01698042011-10-04 00:04:26 +02002032 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 return NULL;
2034}
2035
2036static Py_UCS4*
2037as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2038 int copy_null)
2039{
2040 int kind;
2041 void *data;
2042 Py_ssize_t len, targetlen;
2043 if (PyUnicode_READY(string) == -1)
2044 return NULL;
2045 kind = PyUnicode_KIND(string);
2046 data = PyUnicode_DATA(string);
2047 len = PyUnicode_GET_LENGTH(string);
2048 targetlen = len;
2049 if (copy_null)
2050 targetlen++;
2051 if (!target) {
2052 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2053 PyErr_NoMemory();
2054 return NULL;
2055 }
2056 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2057 if (!target) {
2058 PyErr_NoMemory();
2059 return NULL;
2060 }
2061 }
2062 else {
2063 if (targetsize < targetlen) {
2064 PyErr_Format(PyExc_SystemError,
2065 "string is longer than the buffer");
2066 if (copy_null && 0 < targetsize)
2067 target[0] = 0;
2068 return NULL;
2069 }
2070 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002071 if (kind == PyUnicode_1BYTE_KIND) {
2072 Py_UCS1 *start = (Py_UCS1 *) data;
2073 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002075 else if (kind == PyUnicode_2BYTE_KIND) {
2076 Py_UCS2 *start = (Py_UCS2 *) data;
2077 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2078 }
2079 else {
2080 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002082 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002083 if (copy_null)
2084 target[len] = 0;
2085 return target;
2086}
2087
2088Py_UCS4*
2089PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2090 int copy_null)
2091{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002092 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002093 PyErr_BadInternalCall();
2094 return NULL;
2095 }
2096 return as_ucs4(string, target, targetsize, copy_null);
2097}
2098
2099Py_UCS4*
2100PyUnicode_AsUCS4Copy(PyObject *string)
2101{
2102 return as_ucs4(string, NULL, 0, 1);
2103}
2104
#ifdef HAVE_WCHAR_H

/* Create a unicode object from the wchar_t buffer `w`.

   A size of -1 means "use wcslen(w)".  A NULL buffer is accepted only
   together with size 0, in which case the shared empty string is returned;
   any other NULL call is reported as a bad internal call. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        Py_ssize_t nchars = (size == -1) ? (Py_ssize_t)wcslen(w) : size;
        return PyUnicode_FromUnicode(w, nchars);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    Py_INCREF(unicode_empty);
    return unicode_empty;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002127
Walter Dörwald346737f2007-05-31 10:44:43 +00002128static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002129makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2130 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002131{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002132 *fmt++ = '%';
2133 if (width) {
2134 if (zeropad)
2135 *fmt++ = '0';
2136 fmt += sprintf(fmt, "%d", width);
2137 }
2138 if (precision)
2139 fmt += sprintf(fmt, ".%d", precision);
2140 if (longflag)
2141 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002142 else if (longlongflag) {
2143 /* longlongflag should only ever be nonzero on machines with
2144 HAVE_LONG_LONG defined */
2145#ifdef HAVE_LONG_LONG
2146 char *f = PY_FORMAT_LONG_LONG;
2147 while (*f)
2148 *fmt++ = *f++;
2149#else
2150 /* we shouldn't ever get here */
2151 assert(0);
2152 *fmt++ = 'l';
2153#endif
2154 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002155 else if (size_tflag) {
2156 char *f = PY_FORMAT_SIZE_T;
2157 while (*f)
2158 *fmt++ = *f++;
2159 }
2160 *fmt++ = c;
2161 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002162}
2163
Victor Stinner96865452011-03-01 23:44:09 +00002164/* helper for PyUnicode_FromFormatV() */
2165
2166static const char*
2167parse_format_flags(const char *f,
2168 int *p_width, int *p_precision,
2169 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2170{
2171 int width, precision, longflag, longlongflag, size_tflag;
2172
2173 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2174 f++;
2175 width = 0;
2176 while (Py_ISDIGIT((unsigned)*f))
2177 width = (width*10) + *f++ - '0';
2178 precision = 0;
2179 if (*f == '.') {
2180 f++;
2181 while (Py_ISDIGIT((unsigned)*f))
2182 precision = (precision*10) + *f++ - '0';
2183 if (*f == '%') {
2184 /* "%.3%s" => f points to "3" */
2185 f--;
2186 }
2187 }
2188 if (*f == '\0') {
2189 /* bogus format "%.1" => go backward, f points to "1" */
2190 f--;
2191 }
2192 if (p_width != NULL)
2193 *p_width = width;
2194 if (p_precision != NULL)
2195 *p_precision = precision;
2196
2197 /* Handle %ld, %lu, %lld and %llu. */
2198 longflag = 0;
2199 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002200 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002201
2202 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002203 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002204 longflag = 1;
2205 ++f;
2206 }
2207#ifdef HAVE_LONG_LONG
2208 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002209 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002210 longlongflag = 1;
2211 f += 2;
2212 }
2213#endif
2214 }
2215 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002216 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002217 size_tflag = 1;
2218 ++f;
2219 }
2220 if (p_longflag != NULL)
2221 *p_longflag = longflag;
2222 if (p_longlongflag != NULL)
2223 *p_longlongflag = longlongflag;
2224 if (p_size_tflag != NULL)
2225 *p_size_tflag = size_tflag;
2226 return f;
2227}
2228
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002229/* maximum number of characters required for output of %ld. 21 characters
2230 allows for 64-bit integers (in decimal) and an optional sign. */
2231#define MAX_LONG_CHARS 21
2232/* maximum number of characters required for output of %lld.
2233 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2234 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2235#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2236
Walter Dörwaldd2034312007-05-18 16:29:38 +00002237PyObject *
2238PyUnicode_FromFormatV(const char *format, va_list vargs)
2239{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002240 va_list count;
2241 Py_ssize_t callcount = 0;
2242 PyObject **callresults = NULL;
2243 PyObject **callresult = NULL;
2244 Py_ssize_t n = 0;
2245 int width = 0;
2246 int precision = 0;
2247 int zeropad;
2248 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002249 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002250 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002251 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2253 Py_UCS4 argmaxchar;
2254 Py_ssize_t numbersize = 0;
2255 char *numberresults = NULL;
2256 char *numberresult = NULL;
2257 Py_ssize_t i;
2258 int kind;
2259 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002260
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002261 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002262 /* step 1: count the number of %S/%R/%A/%s format specifications
2263 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2264 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002266 * also estimate a upper bound for all the number formats in the string,
2267 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002269 for (f = format; *f; f++) {
2270 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002271 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2273 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2274 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2275 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002278#ifdef HAVE_LONG_LONG
2279 if (longlongflag) {
2280 if (width < MAX_LONG_LONG_CHARS)
2281 width = MAX_LONG_LONG_CHARS;
2282 }
2283 else
2284#endif
2285 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2286 including sign. Decimal takes the most space. This
2287 isn't enough for octal. If a width is specified we
2288 need more (which we allocate later). */
2289 if (width < MAX_LONG_CHARS)
2290 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002291
2292 /* account for the size + '\0' to separate numbers
2293 inside of the numberresults buffer */
2294 numbersize += (width + 1);
2295 }
2296 }
2297 else if ((unsigned char)*f > 127) {
2298 PyErr_Format(PyExc_ValueError,
2299 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2300 "string, got a non-ASCII byte: 0x%02x",
2301 (unsigned char)*f);
2302 return NULL;
2303 }
2304 }
2305 /* step 2: allocate memory for the results of
2306 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2307 if (callcount) {
2308 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2309 if (!callresults) {
2310 PyErr_NoMemory();
2311 return NULL;
2312 }
2313 callresult = callresults;
2314 }
2315 /* step 2.5: allocate memory for the results of formating numbers */
2316 if (numbersize) {
2317 numberresults = PyObject_Malloc(numbersize);
2318 if (!numberresults) {
2319 PyErr_NoMemory();
2320 goto fail;
2321 }
2322 numberresult = numberresults;
2323 }
2324
2325 /* step 3: format numbers and figure out how large a buffer we need */
2326 for (f = format; *f; f++) {
2327 if (*f == '%') {
2328 const char* p;
2329 int longflag;
2330 int longlongflag;
2331 int size_tflag;
2332 int numprinted;
2333
2334 p = f;
2335 zeropad = (f[1] == '0');
2336 f = parse_format_flags(f, &width, &precision,
2337 &longflag, &longlongflag, &size_tflag);
2338 switch (*f) {
2339 case 'c':
2340 {
2341 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002342 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002343 n++;
2344 break;
2345 }
2346 case '%':
2347 n++;
2348 break;
2349 case 'i':
2350 case 'd':
2351 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2352 width, precision, *f);
2353 if (longflag)
2354 numprinted = sprintf(numberresult, fmt,
2355 va_arg(count, long));
2356#ifdef HAVE_LONG_LONG
2357 else if (longlongflag)
2358 numprinted = sprintf(numberresult, fmt,
2359 va_arg(count, PY_LONG_LONG));
2360#endif
2361 else if (size_tflag)
2362 numprinted = sprintf(numberresult, fmt,
2363 va_arg(count, Py_ssize_t));
2364 else
2365 numprinted = sprintf(numberresult, fmt,
2366 va_arg(count, int));
2367 n += numprinted;
2368 /* advance by +1 to skip over the '\0' */
2369 numberresult += (numprinted + 1);
2370 assert(*(numberresult - 1) == '\0');
2371 assert(*(numberresult - 2) != '\0');
2372 assert(numprinted >= 0);
2373 assert(numberresult <= numberresults + numbersize);
2374 break;
2375 case 'u':
2376 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2377 width, precision, 'u');
2378 if (longflag)
2379 numprinted = sprintf(numberresult, fmt,
2380 va_arg(count, unsigned long));
2381#ifdef HAVE_LONG_LONG
2382 else if (longlongflag)
2383 numprinted = sprintf(numberresult, fmt,
2384 va_arg(count, unsigned PY_LONG_LONG));
2385#endif
2386 else if (size_tflag)
2387 numprinted = sprintf(numberresult, fmt,
2388 va_arg(count, size_t));
2389 else
2390 numprinted = sprintf(numberresult, fmt,
2391 va_arg(count, unsigned int));
2392 n += numprinted;
2393 numberresult += (numprinted + 1);
2394 assert(*(numberresult - 1) == '\0');
2395 assert(*(numberresult - 2) != '\0');
2396 assert(numprinted >= 0);
2397 assert(numberresult <= numberresults + numbersize);
2398 break;
2399 case 'x':
2400 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2401 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2402 n += numprinted;
2403 numberresult += (numprinted + 1);
2404 assert(*(numberresult - 1) == '\0');
2405 assert(*(numberresult - 2) != '\0');
2406 assert(numprinted >= 0);
2407 assert(numberresult <= numberresults + numbersize);
2408 break;
2409 case 'p':
2410 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2411 /* %p is ill-defined: ensure leading 0x. */
2412 if (numberresult[1] == 'X')
2413 numberresult[1] = 'x';
2414 else if (numberresult[1] != 'x') {
2415 memmove(numberresult + 2, numberresult,
2416 strlen(numberresult) + 1);
2417 numberresult[0] = '0';
2418 numberresult[1] = 'x';
2419 numprinted += 2;
2420 }
2421 n += numprinted;
2422 numberresult += (numprinted + 1);
2423 assert(*(numberresult - 1) == '\0');
2424 assert(*(numberresult - 2) != '\0');
2425 assert(numprinted >= 0);
2426 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002427 break;
2428 case 's':
2429 {
2430 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002431 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002432 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002433 if (!str)
2434 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002435 /* since PyUnicode_DecodeUTF8 returns already flexible
2436 unicode objects, there is no need to call ready on them */
2437 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002438 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002440 /* Remember the str and switch to the next slot */
2441 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002442 break;
2443 }
2444 case 'U':
2445 {
2446 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002447 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 if (PyUnicode_READY(obj) == -1)
2449 goto fail;
2450 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002451 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002452 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002453 break;
2454 }
2455 case 'V':
2456 {
2457 PyObject *obj = va_arg(count, PyObject *);
2458 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002459 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002460 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002461 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002462 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002463 if (PyUnicode_READY(obj) == -1)
2464 goto fail;
2465 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002466 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002467 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002468 *callresult++ = NULL;
2469 }
2470 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002471 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002472 if (!str_obj)
2473 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002474 if (PyUnicode_READY(str_obj)) {
2475 Py_DECREF(str_obj);
2476 goto fail;
2477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002478 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002479 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002481 *callresult++ = str_obj;
2482 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002483 break;
2484 }
2485 case 'S':
2486 {
2487 PyObject *obj = va_arg(count, PyObject *);
2488 PyObject *str;
2489 assert(obj);
2490 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002491 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002492 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002493 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002494 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002496 /* Remember the str and switch to the next slot */
2497 *callresult++ = str;
2498 break;
2499 }
2500 case 'R':
2501 {
2502 PyObject *obj = va_arg(count, PyObject *);
2503 PyObject *repr;
2504 assert(obj);
2505 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002506 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002507 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002508 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002509 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002510 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002511 /* Remember the repr and switch to the next slot */
2512 *callresult++ = repr;
2513 break;
2514 }
2515 case 'A':
2516 {
2517 PyObject *obj = va_arg(count, PyObject *);
2518 PyObject *ascii;
2519 assert(obj);
2520 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002521 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002522 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002523 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002524 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002525 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002526 /* Remember the repr and switch to the next slot */
2527 *callresult++ = ascii;
2528 break;
2529 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002530 default:
2531 /* if we stumble upon an unknown
2532 formatting code, copy the rest of
2533 the format string to the output
2534 string. (we cannot just skip the
2535 code, since there's no way to know
2536 what's in the argument list) */
2537 n += strlen(p);
2538 goto expand;
2539 }
2540 } else
2541 n++;
2542 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002543 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002544 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002545 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002546 we don't have to resize the string.
2547 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002548 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002549 if (!string)
2550 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002551 kind = PyUnicode_KIND(string);
2552 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002553 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002554 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002557 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002558 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002559
2560 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002561 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2562 /* checking for == because the last argument could be a empty
2563 string, which causes i to point to end, the assert at the end of
2564 the loop */
2565 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002566
Benjamin Peterson14339b62009-01-31 16:36:08 +00002567 switch (*f) {
2568 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002569 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002570 const int ordinal = va_arg(vargs, int);
2571 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002572 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002573 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002574 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002575 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002576 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002577 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002578 case 'p':
2579 /* unused, since we already have the result */
2580 if (*f == 'p')
2581 (void) va_arg(vargs, void *);
2582 else
2583 (void) va_arg(vargs, int);
2584 /* extract the result from numberresults and append. */
2585 for (; *numberresult; ++i, ++numberresult)
2586 PyUnicode_WRITE(kind, data, i, *numberresult);
2587 /* skip over the separating '\0' */
2588 assert(*numberresult == '\0');
2589 numberresult++;
2590 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 break;
2592 case 's':
2593 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002594 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002595 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002596 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002597 size = PyUnicode_GET_LENGTH(*callresult);
2598 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002599 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002600 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002601 /* We're done with the unicode()/repr() => forget it */
2602 Py_DECREF(*callresult);
2603 /* switch to next unicode()/repr() result */
2604 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002605 break;
2606 }
2607 case 'U':
2608 {
2609 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002610 Py_ssize_t size;
2611 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2612 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002613 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002614 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002615 break;
2616 }
2617 case 'V':
2618 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002620 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002621 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002622 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002623 size = PyUnicode_GET_LENGTH(obj);
2624 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002625 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002627 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002628 size = PyUnicode_GET_LENGTH(*callresult);
2629 assert(PyUnicode_KIND(*callresult) <=
2630 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002631 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002632 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002633 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002634 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002635 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002636 break;
2637 }
2638 case 'S':
2639 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002640 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002641 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002642 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002643 /* unused, since we already have the result */
2644 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002645 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002646 copy_characters(string, i, *callresult, 0, size);
2647 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 /* We're done with the unicode()/repr() => forget it */
2649 Py_DECREF(*callresult);
2650 /* switch to next unicode()/repr() result */
2651 ++callresult;
2652 break;
2653 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002654 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002655 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002656 break;
2657 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002658 for (; *p; ++p, ++i)
2659 PyUnicode_WRITE(kind, data, i, *p);
2660 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002661 goto end;
2662 }
Victor Stinner1205f272010-09-11 00:54:47 +00002663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002664 else {
2665 assert(i < PyUnicode_GET_LENGTH(string));
2666 PyUnicode_WRITE(kind, data, i++, *f);
2667 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002670
Benjamin Peterson29060642009-01-31 22:14:21 +00002671 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002672 if (callresults)
2673 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002674 if (numberresults)
2675 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002676 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002677 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002678 if (callresults) {
2679 PyObject **callresult2 = callresults;
2680 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002681 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002682 ++callresult2;
2683 }
2684 PyObject_Free(callresults);
2685 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002686 if (numberresults)
2687 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002688 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002689}
2690
Walter Dörwaldd2034312007-05-18 16:29:38 +00002691PyObject *
2692PyUnicode_FromFormat(const char *format, ...)
2693{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 PyObject* ret;
2695 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002696
2697#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002698 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002699#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002700 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002701#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002702 ret = PyUnicode_FromFormatV(format, vargs);
2703 va_end(vargs);
2704 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002705}
2706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002707#ifdef HAVE_WCHAR_H
2708
Victor Stinner5593d8a2010-10-02 11:11:27 +00002709/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2710 convert a Unicode object to a wide character string.
2711
Victor Stinnerd88d9832011-09-06 02:00:05 +02002712 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002713 character) required to convert the unicode object. Ignore size argument.
2714
Victor Stinnerd88d9832011-09-06 02:00:05 +02002715 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002716 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002717 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002718static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002719unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002720 wchar_t *w,
2721 Py_ssize_t size)
2722{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002723 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002724 const wchar_t *wstr;
2725
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002726 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002727 if (wstr == NULL)
2728 return -1;
2729
Victor Stinner5593d8a2010-10-02 11:11:27 +00002730 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002731 if (size > res)
2732 size = res + 1;
2733 else
2734 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002735 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002736 return res;
2737 }
2738 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002740}
2741
2742Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002743PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002744 wchar_t *w,
2745 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746{
2747 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002748 PyErr_BadInternalCall();
2749 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002751 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752}
2753
Victor Stinner137c34c2010-09-29 10:25:54 +00002754wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002755PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002756 Py_ssize_t *size)
2757{
2758 wchar_t* buffer;
2759 Py_ssize_t buflen;
2760
2761 if (unicode == NULL) {
2762 PyErr_BadInternalCall();
2763 return NULL;
2764 }
2765
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002766 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002767 if (buflen == -1)
2768 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002769 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002770 PyErr_NoMemory();
2771 return NULL;
2772 }
2773
Victor Stinner137c34c2010-09-29 10:25:54 +00002774 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2775 if (buffer == NULL) {
2776 PyErr_NoMemory();
2777 return NULL;
2778 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002779 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002780 if (buflen == -1)
2781 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002782 if (size != NULL)
2783 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002784 return buffer;
2785}
2786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002787#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788
Alexander Belopolsky40018472011-02-26 01:02:56 +00002789PyObject *
2790PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002791{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002792 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002793 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002794 PyErr_SetString(PyExc_ValueError,
2795 "chr() arg not in range(0x110000)");
2796 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002797 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002799 if (ordinal < 256)
2800 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002802 v = PyUnicode_New(1, ordinal);
2803 if (v == NULL)
2804 return NULL;
2805 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002806 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002807 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002808}
2809
Alexander Belopolsky40018472011-02-26 01:02:56 +00002810PyObject *
2811PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002813 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002814 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002815 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002816 if (PyUnicode_READY(obj))
2817 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002818 Py_INCREF(obj);
2819 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002820 }
2821 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002822 /* For a Unicode subtype that's not a Unicode object,
2823 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002824 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002825 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002826 PyErr_Format(PyExc_TypeError,
2827 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002828 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002829 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002830}
2831
Alexander Belopolsky40018472011-02-26 01:02:56 +00002832PyObject *
2833PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002834 const char *encoding,
2835 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002836{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002837 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002838 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002839
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002841 PyErr_BadInternalCall();
2842 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002844
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002845 /* Decoding bytes objects is the most common case and should be fast */
2846 if (PyBytes_Check(obj)) {
2847 if (PyBytes_GET_SIZE(obj) == 0) {
2848 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002849 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002850 }
2851 else {
2852 v = PyUnicode_Decode(
2853 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2854 encoding, errors);
2855 }
2856 return v;
2857 }
2858
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002859 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002860 PyErr_SetString(PyExc_TypeError,
2861 "decoding str is not supported");
2862 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002863 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002864
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002865 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2866 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2867 PyErr_Format(PyExc_TypeError,
2868 "coercing to str: need bytes, bytearray "
2869 "or buffer-like object, %.80s found",
2870 Py_TYPE(obj)->tp_name);
2871 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002872 }
Tim Petersced69f82003-09-16 20:30:58 +00002873
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002874 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002875 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002876 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 }
Tim Petersced69f82003-09-16 20:30:58 +00002878 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002879 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002880
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002881 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002882 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883}
2884
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   `lower` receives the NUL-terminated result; `lower_len` is the number
   of chars available in `lower`, terminator included.

   Return 1 on success, 0 on error (the normalized name -- including the
   default "utf-8" used for a NULL encoding -- is longer than lower_len-1).
   On error the content of `lower` is unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_name[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* With no room at all, `lower_len - 1` below would wrap around and
       silently disable the bound check. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* Apply the same bound as the copy loop below: sizeof() counts
           the terminating NUL. */
        if (sizeof(default_name) > lower_len)
            return 0;
        strcpy(lower, default_name);
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last writable slot: reserved for the terminating NUL */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2921
Alexander Belopolsky40018472011-02-26 01:02:56 +00002922PyObject *
2923PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002924 Py_ssize_t size,
2925 const char *encoding,
2926 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002927{
2928 PyObject *buffer = NULL, *unicode;
2929 Py_buffer info;
2930 char lower[11]; /* Enough for any encoding shortcut */
2931
Fred Drakee4315f52000-05-09 19:53:39 +00002932 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002933 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002934 if ((strcmp(lower, "utf-8") == 0) ||
2935 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002936 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002937 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002938 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002939 (strcmp(lower, "iso-8859-1") == 0))
2940 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002941#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002942 else if (strcmp(lower, "mbcs") == 0)
2943 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002944#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002945 else if (strcmp(lower, "ascii") == 0)
2946 return PyUnicode_DecodeASCII(s, size, errors);
2947 else if (strcmp(lower, "utf-16") == 0)
2948 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2949 else if (strcmp(lower, "utf-32") == 0)
2950 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2951 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002952
2953 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002954 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002955 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002956 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002957 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958 if (buffer == NULL)
2959 goto onError;
2960 unicode = PyCodec_Decode(buffer, encoding, errors);
2961 if (unicode == NULL)
2962 goto onError;
2963 if (!PyUnicode_Check(unicode)) {
2964 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002965 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002966 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967 Py_DECREF(unicode);
2968 goto onError;
2969 }
2970 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002971 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002972
Benjamin Peterson29060642009-01-31 22:14:21 +00002973 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002974 Py_XDECREF(buffer);
2975 return NULL;
2976}
2977
Alexander Belopolsky40018472011-02-26 01:02:56 +00002978PyObject *
2979PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002980 const char *encoding,
2981 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002982{
2983 PyObject *v;
2984
2985 if (!PyUnicode_Check(unicode)) {
2986 PyErr_BadArgument();
2987 goto onError;
2988 }
2989
2990 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002991 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002992
2993 /* Decode via the codec registry */
2994 v = PyCodec_Decode(unicode, encoding, errors);
2995 if (v == NULL)
2996 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002997 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002998
Benjamin Peterson29060642009-01-31 22:14:21 +00002999 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003000 return NULL;
3001}
3002
Alexander Belopolsky40018472011-02-26 01:02:56 +00003003PyObject *
3004PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003005 const char *encoding,
3006 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003007{
3008 PyObject *v;
3009
3010 if (!PyUnicode_Check(unicode)) {
3011 PyErr_BadArgument();
3012 goto onError;
3013 }
3014
3015 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003016 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003017
3018 /* Decode via the codec registry */
3019 v = PyCodec_Decode(unicode, encoding, errors);
3020 if (v == NULL)
3021 goto onError;
3022 if (!PyUnicode_Check(v)) {
3023 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003024 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003025 Py_TYPE(v)->tp_name);
3026 Py_DECREF(v);
3027 goto onError;
3028 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003029 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003030
Benjamin Peterson29060642009-01-31 22:14:21 +00003031 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003032 return NULL;
3033}
3034
Alexander Belopolsky40018472011-02-26 01:02:56 +00003035PyObject *
3036PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003037 Py_ssize_t size,
3038 const char *encoding,
3039 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040{
3041 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003042
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043 unicode = PyUnicode_FromUnicode(s, size);
3044 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003045 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3047 Py_DECREF(unicode);
3048 return v;
3049}
3050
Alexander Belopolsky40018472011-02-26 01:02:56 +00003051PyObject *
3052PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003053 const char *encoding,
3054 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003055{
3056 PyObject *v;
3057
3058 if (!PyUnicode_Check(unicode)) {
3059 PyErr_BadArgument();
3060 goto onError;
3061 }
3062
3063 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003064 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003065
3066 /* Encode via the codec registry */
3067 v = PyCodec_Encode(unicode, encoding, errors);
3068 if (v == NULL)
3069 goto onError;
3070 return v;
3071
Benjamin Peterson29060642009-01-31 22:14:21 +00003072 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003073 return NULL;
3074}
3075
/* Locate the first wide character of `wstr` that wcstombs() cannot
   convert in the current locale.

   The string is fed to wcstombs() one character at a time (a surrogate
   pair counts as one character when wchar_t is 16 bits wide).  Returns
   the index, in wchar_t units, of the first character that fails, or 0
   if every character converts.  errno is restored to its value on entry
   before returning. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (one or two wchar_t units) plus the terminator */
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3];
#else
    wchar_t chunk[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;
    const wchar_t *chunk_start;
    size_t converted;
    size_t position = 0;
    const int saved_errno = errno;

#if SIZEOF_WCHAR_T == 2
    chunk[2] = 0;
#else
    chunk[1] = 0;
#endif
    for (cursor = wstr; *cursor != L'\0'; ) {
        chunk_start = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep the pair together so it is converted as one character */
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = *cursor;
            chunk[1] = 0;
            cursor++;
        }
#else
        chunk[0] = *cursor;
        cursor++;
#endif
        converted = wcstombs(scratch, chunk, sizeof(scratch));
        if (converted == (size_t)-1) {
            position = chunk_start - wstr;
            break;
        }
    }

    /* position is still 0 if the unencodable character was not found */
    errno = saved_errno;
    return position;
}
3127
Victor Stinner1b579672011-12-17 05:47:23 +01003128static int
3129locale_error_handler(const char *errors, int *surrogateescape)
3130{
3131 if (errors == NULL) {
3132 *surrogateescape = 0;
3133 return 0;
3134 }
3135
3136 if (strcmp(errors, "strict") == 0) {
3137 *surrogateescape = 0;
3138 return 0;
3139 }
3140 if (strcmp(errors, "surrogateescape") == 0) {
3141 *surrogateescape = 1;
3142 return 0;
3143 }
3144 PyErr_Format(PyExc_ValueError,
3145 "only 'strict' and 'surrogateescape' error handlers "
3146 "are supported, not '%s'",
3147 errors);
3148 return -1;
3149}
3150
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003151PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003152PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003153{
3154 Py_ssize_t wlen, wlen2;
3155 wchar_t *wstr;
3156 PyObject *bytes = NULL;
3157 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003158 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003159 PyObject *exc;
3160 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003161 int surrogateescape;
3162
3163 if (locale_error_handler(errors, &surrogateescape) < 0)
3164 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003165
3166 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3167 if (wstr == NULL)
3168 return NULL;
3169
3170 wlen2 = wcslen(wstr);
3171 if (wlen2 != wlen) {
3172 PyMem_Free(wstr);
3173 PyErr_SetString(PyExc_TypeError, "embedded null character");
3174 return NULL;
3175 }
3176
3177 if (surrogateescape) {
3178 /* locale encoding with surrogateescape */
3179 char *str;
3180
3181 str = _Py_wchar2char(wstr, &error_pos);
3182 if (str == NULL) {
3183 if (error_pos == (size_t)-1) {
3184 PyErr_NoMemory();
3185 PyMem_Free(wstr);
3186 return NULL;
3187 }
3188 else {
3189 goto encode_error;
3190 }
3191 }
3192 PyMem_Free(wstr);
3193
3194 bytes = PyBytes_FromString(str);
3195 PyMem_Free(str);
3196 }
3197 else {
3198 size_t len, len2;
3199
3200 len = wcstombs(NULL, wstr, 0);
3201 if (len == (size_t)-1) {
3202 error_pos = wcstombs_errorpos(wstr);
3203 goto encode_error;
3204 }
3205
3206 bytes = PyBytes_FromStringAndSize(NULL, len);
3207 if (bytes == NULL) {
3208 PyMem_Free(wstr);
3209 return NULL;
3210 }
3211
3212 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3213 if (len2 == (size_t)-1 || len2 > len) {
3214 error_pos = wcstombs_errorpos(wstr);
3215 goto encode_error;
3216 }
3217 PyMem_Free(wstr);
3218 }
3219 return bytes;
3220
3221encode_error:
3222 errmsg = strerror(errno);
3223 assert(errmsg != NULL);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003224 PyMem_Free(wstr);
3225 Py_XDECREF(bytes);
3226
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003227 if (errmsg != NULL)
Victor Stinner1b579672011-12-17 05:47:23 +01003228 reason = PyUnicode_DecodeLocale(errmsg, "surrogateescape");
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003229 else
3230 reason = PyUnicode_FromString(
3231 "wcstombs() encountered an unencodable "
3232 "wide character");
3233 if (reason == NULL)
3234 return NULL;
3235
3236 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3237 "locale", unicode,
3238 (Py_ssize_t)error_pos,
3239 (Py_ssize_t)(error_pos+1),
3240 reason);
3241 Py_DECREF(reason);
3242 if (exc != NULL) {
3243 PyCodec_StrictErrors(exc);
3244 Py_XDECREF(exc);
3245 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003246 return NULL;
3247}
3248
Victor Stinnerad158722010-10-27 00:25:46 +00003249PyObject *
3250PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003251{
Victor Stinner99b95382011-07-04 14:23:54 +02003252#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003253 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003254#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003255 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003256#else
Victor Stinner793b5312011-04-27 00:24:21 +02003257 PyInterpreterState *interp = PyThreadState_GET()->interp;
3258 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3259 cannot use it to encode and decode filenames before it is loaded. Load
3260 the Python codec requires to encode at least its own filename. Use the C
3261 version of the locale codec until the codec registry is initialized and
3262 the Python codec is loaded.
3263
3264 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3265 cannot only rely on it: check also interp->fscodec_initialized for
3266 subinterpreters. */
3267 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003268 return PyUnicode_AsEncodedString(unicode,
3269 Py_FileSystemDefaultEncoding,
3270 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003271 }
3272 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003273 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003274 }
Victor Stinnerad158722010-10-27 00:25:46 +00003275#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003276}
3277
Alexander Belopolsky40018472011-02-26 01:02:56 +00003278PyObject *
3279PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003280 const char *encoding,
3281 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003282{
3283 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003284 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003285
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286 if (!PyUnicode_Check(unicode)) {
3287 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003288 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003289 }
Fred Drakee4315f52000-05-09 19:53:39 +00003290
Fred Drakee4315f52000-05-09 19:53:39 +00003291 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003292 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003293 if ((strcmp(lower, "utf-8") == 0) ||
3294 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003295 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003296 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003297 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003298 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003299 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003300 }
Victor Stinner37296e82010-06-10 13:36:23 +00003301 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003302 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003303 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003304 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003305#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003306 else if (strcmp(lower, "mbcs") == 0)
3307 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003308#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003309 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003310 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003311 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312
3313 /* Encode via the codec registry */
3314 v = PyCodec_Encode(unicode, encoding, errors);
3315 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003316 return NULL;
3317
3318 /* The normal path */
3319 if (PyBytes_Check(v))
3320 return v;
3321
3322 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003323 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003324 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003325 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003326
3327 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3328 "encoder %s returned bytearray instead of bytes",
3329 encoding);
3330 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003331 Py_DECREF(v);
3332 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003333 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003334
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003335 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3336 Py_DECREF(v);
3337 return b;
3338 }
3339
3340 PyErr_Format(PyExc_TypeError,
3341 "encoder did not return a bytes object (type=%.400s)",
3342 Py_TYPE(v)->tp_name);
3343 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003344 return NULL;
3345}
3346
Alexander Belopolsky40018472011-02-26 01:02:56 +00003347PyObject *
3348PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003349 const char *encoding,
3350 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003351{
3352 PyObject *v;
3353
3354 if (!PyUnicode_Check(unicode)) {
3355 PyErr_BadArgument();
3356 goto onError;
3357 }
3358
3359 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003360 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003361
3362 /* Encode via the codec registry */
3363 v = PyCodec_Encode(unicode, encoding, errors);
3364 if (v == NULL)
3365 goto onError;
3366 if (!PyUnicode_Check(v)) {
3367 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003368 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003369 Py_TYPE(v)->tp_name);
3370 Py_DECREF(v);
3371 goto onError;
3372 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003373 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003374
Benjamin Peterson29060642009-01-31 22:14:21 +00003375 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003376 return NULL;
3377}
3378
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003379PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003380PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003381 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003382{
3383 wchar_t smallbuf[256];
3384 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3385 wchar_t *wstr;
3386 size_t wlen, wlen2;
3387 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003388 int surrogateescape;
3389
3390 if (locale_error_handler(errors, &surrogateescape) < 0)
3391 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003392
3393 if (str[len] != '\0' || len != strlen(str)) {
3394 PyErr_SetString(PyExc_TypeError, "embedded null character");
3395 return NULL;
3396 }
3397
3398 if (surrogateescape)
3399 {
3400 wstr = _Py_char2wchar(str, &wlen);
3401 if (wstr == NULL) {
3402 if (wlen == (size_t)-1)
3403 PyErr_NoMemory();
3404 else
3405 PyErr_SetFromErrno(PyExc_OSError);
3406 return NULL;
3407 }
3408
3409 unicode = PyUnicode_FromWideChar(wstr, wlen);
3410 PyMem_Free(wstr);
3411 }
3412 else {
3413#ifndef HAVE_BROKEN_MBSTOWCS
3414 wlen = mbstowcs(NULL, str, 0);
3415#else
3416 wlen = len;
3417#endif
3418 if (wlen == (size_t)-1) {
3419 PyErr_SetFromErrno(PyExc_OSError);
3420 return NULL;
3421 }
3422 if (wlen+1 <= smallbuf_len) {
3423 wstr = smallbuf;
3424 }
3425 else {
3426 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3427 return PyErr_NoMemory();
3428
3429 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3430 if (!wstr)
3431 return PyErr_NoMemory();
3432 }
3433
3434 /* This shouldn't fail now */
3435 wlen2 = mbstowcs(wstr, str, wlen+1);
3436 if (wlen2 == (size_t)-1) {
3437 if (wstr != smallbuf)
3438 PyMem_Free(wstr);
3439 PyErr_SetFromErrno(PyExc_OSError);
3440 return NULL;
3441 }
3442#ifdef HAVE_BROKEN_MBSTOWCS
3443 assert(wlen2 == wlen);
3444#endif
3445 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3446 if (wstr != smallbuf)
3447 PyMem_Free(wstr);
3448 }
3449 return unicode;
3450}
3451
3452PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003453PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003454{
3455 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003456 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003457}
3458
3459
3460PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003461PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003462 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003463 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3464}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003465
Christian Heimes5894ba72007-11-04 11:43:14 +00003466PyObject*
3467PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3468{
Victor Stinner99b95382011-07-04 14:23:54 +02003469#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003470 return PyUnicode_DecodeMBCS(s, size, NULL);
3471#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003472 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003473#else
Victor Stinner793b5312011-04-27 00:24:21 +02003474 PyInterpreterState *interp = PyThreadState_GET()->interp;
3475 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3476 cannot use it to encode and decode filenames before it is loaded. Load
3477 the Python codec requires to encode at least its own filename. Use the C
3478 version of the locale codec until the codec registry is initialized and
3479 the Python codec is loaded.
3480
3481 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3482 cannot only rely on it: check also interp->fscodec_initialized for
3483 subinterpreters. */
3484 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003485 return PyUnicode_Decode(s, size,
3486 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003487 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003488 }
3489 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003490 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003491 }
Victor Stinnerad158722010-10-27 00:25:46 +00003492#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003493}
3494
Martin v. Löwis011e8422009-05-05 04:43:17 +00003495
3496int
3497PyUnicode_FSConverter(PyObject* arg, void* addr)
3498{
3499 PyObject *output = NULL;
3500 Py_ssize_t size;
3501 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003502 if (arg == NULL) {
3503 Py_DECREF(*(PyObject**)addr);
3504 return 1;
3505 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003506 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003507 output = arg;
3508 Py_INCREF(output);
3509 }
3510 else {
3511 arg = PyUnicode_FromObject(arg);
3512 if (!arg)
3513 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003514 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003515 Py_DECREF(arg);
3516 if (!output)
3517 return 0;
3518 if (!PyBytes_Check(output)) {
3519 Py_DECREF(output);
3520 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3521 return 0;
3522 }
3523 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003524 size = PyBytes_GET_SIZE(output);
3525 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003526 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003527 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003528 Py_DECREF(output);
3529 return 0;
3530 }
3531 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003532 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003533}
3534
3535
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003536int
3537PyUnicode_FSDecoder(PyObject* arg, void* addr)
3538{
3539 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003540 if (arg == NULL) {
3541 Py_DECREF(*(PyObject**)addr);
3542 return 1;
3543 }
3544 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003545 if (PyUnicode_READY(arg))
3546 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003547 output = arg;
3548 Py_INCREF(output);
3549 }
3550 else {
3551 arg = PyBytes_FromObject(arg);
3552 if (!arg)
3553 return 0;
3554 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3555 PyBytes_GET_SIZE(arg));
3556 Py_DECREF(arg);
3557 if (!output)
3558 return 0;
3559 if (!PyUnicode_Check(output)) {
3560 Py_DECREF(output);
3561 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3562 return 0;
3563 }
3564 }
Victor Stinner065836e2011-10-27 01:56:33 +02003565 if (PyUnicode_READY(output) < 0) {
3566 Py_DECREF(output);
3567 return 0;
3568 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003569 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003570 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003571 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3572 Py_DECREF(output);
3573 return 0;
3574 }
3575 *(PyObject**)addr = output;
3576 return Py_CLEANUP_SUPPORTED;
3577}
3578
3579
Martin v. Löwis5b222132007-06-10 09:51:05 +00003580char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003581PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003582{
Christian Heimesf3863112007-11-22 07:46:41 +00003583 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003584
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003585 if (!PyUnicode_Check(unicode)) {
3586 PyErr_BadArgument();
3587 return NULL;
3588 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003589 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003590 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003591
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003592 if (PyUnicode_UTF8(unicode) == NULL) {
3593 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003594 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3595 if (bytes == NULL)
3596 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003597 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3598 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003599 Py_DECREF(bytes);
3600 return NULL;
3601 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003602 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3603 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3604 PyBytes_AS_STRING(bytes),
3605 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003606 Py_DECREF(bytes);
3607 }
3608
3609 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003610 *psize = PyUnicode_UTF8_LENGTH(unicode);
3611 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003612}
3613
3614char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003615PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003616{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003617 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3618}
3619
#ifdef Py_DEBUG
/* Debug-build counter, incremented by PyUnicode_AsUnicodeAndSize() each
   time it finds no existing wchar_t representation on the object. */
static int unicode_as_unicode_calls = 0;
#endif
3623
3624
3625Py_UNICODE *
3626PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3627{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003628 const unsigned char *one_byte;
3629#if SIZEOF_WCHAR_T == 4
3630 const Py_UCS2 *two_bytes;
3631#else
3632 const Py_UCS4 *four_bytes;
3633 const Py_UCS4 *ucs4_end;
3634 Py_ssize_t num_surrogates;
3635#endif
3636 wchar_t *w;
3637 wchar_t *wchar_end;
3638
3639 if (!PyUnicode_Check(unicode)) {
3640 PyErr_BadArgument();
3641 return NULL;
3642 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003643 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003644 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003645 assert(_PyUnicode_KIND(unicode) != 0);
3646 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003647
3648#ifdef Py_DEBUG
3649 ++unicode_as_unicode_calls;
3650#endif
3651
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003652 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003653#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003654 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3655 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003656 num_surrogates = 0;
3657
3658 for (; four_bytes < ucs4_end; ++four_bytes) {
3659 if (*four_bytes > 0xFFFF)
3660 ++num_surrogates;
3661 }
3662
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003663 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3664 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3665 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003666 PyErr_NoMemory();
3667 return NULL;
3668 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003669 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003670
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003671 w = _PyUnicode_WSTR(unicode);
3672 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3673 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003674 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3675 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003676 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003677 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003678 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3679 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003680 }
3681 else
3682 *w = *four_bytes;
3683
3684 if (w > wchar_end) {
3685 assert(0 && "Miscalculated string end");
3686 }
3687 }
3688 *w = 0;
3689#else
3690 /* sizeof(wchar_t) == 4 */
3691 Py_FatalError("Impossible unicode object state, wstr and str "
3692 "should share memory already.");
3693 return NULL;
3694#endif
3695 }
3696 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003697 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3698 (_PyUnicode_LENGTH(unicode) + 1));
3699 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003700 PyErr_NoMemory();
3701 return NULL;
3702 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003703 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3704 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3705 w = _PyUnicode_WSTR(unicode);
3706 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003707
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003708 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3709 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003710 for (; w < wchar_end; ++one_byte, ++w)
3711 *w = *one_byte;
3712 /* null-terminate the wstr */
3713 *w = 0;
3714 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003715 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003716#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003717 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003718 for (; w < wchar_end; ++two_bytes, ++w)
3719 *w = *two_bytes;
3720 /* null-terminate the wstr */
3721 *w = 0;
3722#else
3723 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003724 PyObject_FREE(_PyUnicode_WSTR(unicode));
3725 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003726 Py_FatalError("Impossible unicode object state, wstr "
3727 "and str should share memory already.");
3728 return NULL;
3729#endif
3730 }
3731 else {
3732 assert(0 && "This should never happen.");
3733 }
3734 }
3735 }
3736 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003737 *size = PyUnicode_WSTR_LENGTH(unicode);
3738 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003739}
3740
Alexander Belopolsky40018472011-02-26 01:02:56 +00003741Py_UNICODE *
3742PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003743{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003744 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745}
3746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003747
Alexander Belopolsky40018472011-02-26 01:02:56 +00003748Py_ssize_t
3749PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750{
3751 if (!PyUnicode_Check(unicode)) {
3752 PyErr_BadArgument();
3753 goto onError;
3754 }
3755 return PyUnicode_GET_SIZE(unicode);
3756
Benjamin Peterson29060642009-01-31 22:14:21 +00003757 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758 return -1;
3759}
3760
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003761Py_ssize_t
3762PyUnicode_GetLength(PyObject *unicode)
3763{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003764 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003765 PyErr_BadArgument();
3766 return -1;
3767 }
3768
3769 return PyUnicode_GET_LENGTH(unicode);
3770}
3771
3772Py_UCS4
3773PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3774{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003775 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3776 PyErr_BadArgument();
3777 return (Py_UCS4)-1;
3778 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003779 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003780 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003781 return (Py_UCS4)-1;
3782 }
3783 return PyUnicode_READ_CHAR(unicode, index);
3784}
3785
3786int
3787PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3788{
3789 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003790 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003791 return -1;
3792 }
Victor Stinner488fa492011-12-12 00:01:39 +01003793 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003794 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003795 PyErr_SetString(PyExc_IndexError, "string index out of range");
3796 return -1;
3797 }
Victor Stinner488fa492011-12-12 00:01:39 +01003798 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003799 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003800 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3801 index, ch);
3802 return 0;
3803}
3804
/* Return the name of the default encoding.  The value is the fixed string
   "utf-8" held in static storage; the caller must not modify or free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3810
Victor Stinner554f3f02010-06-16 23:33:54 +00003811/* create or adjust a UnicodeDecodeError */
3812static void
3813make_decode_exception(PyObject **exceptionObject,
3814 const char *encoding,
3815 const char *input, Py_ssize_t length,
3816 Py_ssize_t startpos, Py_ssize_t endpos,
3817 const char *reason)
3818{
3819 if (*exceptionObject == NULL) {
3820 *exceptionObject = PyUnicodeDecodeError_Create(
3821 encoding, input, length, startpos, endpos, reason);
3822 }
3823 else {
3824 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3825 goto onError;
3826 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3827 goto onError;
3828 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3829 goto onError;
3830 }
3831 return;
3832
3833onError:
3834 Py_DECREF(*exceptionObject);
3835 *exceptionObject = NULL;
3836}
3837
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003838/* error handling callback helper:
3839 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003840 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003841 and adjust various state variables.
3842 return 0 on success, -1 on error
3843*/
3844
Alexander Belopolsky40018472011-02-26 01:02:56 +00003845static int
3846unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003847 const char *encoding, const char *reason,
3848 const char **input, const char **inend, Py_ssize_t *startinpos,
3849 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003850 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003851{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003852 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003853
3854 PyObject *restuple = NULL;
3855 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003856 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003857 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003858 Py_ssize_t requiredsize;
3859 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003860 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003861 int res = -1;
3862
Victor Stinner596a6c42011-11-09 00:02:18 +01003863 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3864 outsize = PyUnicode_GET_LENGTH(*output);
3865 else
3866 outsize = _PyUnicode_WSTR_LENGTH(*output);
3867
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003868 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003869 *errorHandler = PyCodec_LookupError(errors);
3870 if (*errorHandler == NULL)
3871 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003872 }
3873
Victor Stinner554f3f02010-06-16 23:33:54 +00003874 make_decode_exception(exceptionObject,
3875 encoding,
3876 *input, *inend - *input,
3877 *startinpos, *endinpos,
3878 reason);
3879 if (*exceptionObject == NULL)
3880 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003881
3882 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3883 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003884 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003885 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003886 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003887 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003888 }
3889 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003890 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003891 if (PyUnicode_READY(repunicode) < 0)
3892 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003893
3894 /* Copy back the bytes variables, which might have been modified by the
3895 callback */
3896 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3897 if (!inputobj)
3898 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003899 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003900 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003901 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003902 *input = PyBytes_AS_STRING(inputobj);
3903 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003904 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003905 /* we can DECREF safely, as the exception has another reference,
3906 so the object won't go away. */
3907 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003908
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003909 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003910 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003911 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003912 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3913 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003914 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003915
Victor Stinner596a6c42011-11-09 00:02:18 +01003916 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3917 /* need more space? (at least enough for what we
3918 have+the replacement+the rest of the string (starting
3919 at the new input position), so we won't have to check space
3920 when there are no errors in the rest of the string) */
3921 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3922 requiredsize = *outpos + replen + insize-newpos;
3923 if (requiredsize > outsize) {
3924 if (requiredsize<2*outsize)
3925 requiredsize = 2*outsize;
3926 if (unicode_resize(output, requiredsize) < 0)
3927 goto onError;
3928 }
3929 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003930 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003931 copy_characters(*output, *outpos, repunicode, 0, replen);
3932 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003933 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003934 else {
3935 wchar_t *repwstr;
3936 Py_ssize_t repwlen;
3937 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3938 if (repwstr == NULL)
3939 goto onError;
3940 /* need more space? (at least enough for what we
3941 have+the replacement+the rest of the string (starting
3942 at the new input position), so we won't have to check space
3943 when there are no errors in the rest of the string) */
3944 requiredsize = *outpos + repwlen + insize-newpos;
3945 if (requiredsize > outsize) {
3946 if (requiredsize < 2*outsize)
3947 requiredsize = 2*outsize;
3948 if (unicode_resize(output, requiredsize) < 0)
3949 goto onError;
3950 }
3951 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3952 *outpos += repwlen;
3953 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003954 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003955 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003956
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003957 /* we made it! */
3958 res = 0;
3959
Benjamin Peterson29060642009-01-31 22:14:21 +00003960 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003961 Py_XDECREF(restuple);
3962 return res;
3963}
3964
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003965/* --- UTF-7 Codec -------------------------------------------------------- */
3966
Antoine Pitrou244651a2009-05-04 18:56:13 +00003967/* See RFC2152 for details. We encode conservatively and decode liberally. */
3968
/* Base-64 helper macros used by the UTF-7 codec.  Each one may evaluate
   its argument more than once, so pass side-effect-free expressions. */

/* Nonzero when c belongs to the base-64 alphabet (A-Z, a-z, 0-9, +, /). */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z'))

/* Map a base-64 character to its 6-bit value: A-Z -> 0..25, a-z -> 26..51,
   0-9 -> 52..61, '+' -> 62 and anything else (i.e. '/') -> 63. */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 :                                \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :    \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :         \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :    \
     63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: nonzero when a byte found in UTF-7 input outside a shift
 * sequence stands for itself.  Decoding is permissive: every value up to
 * 127 qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
3999
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked before it is used as a table index.  All three
 * arguments are parenthesized in the expansion so that arbitrary
 * expressions can be passed; c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004045
Alexander Belopolsky40018472011-02-26 01:02:56 +00004046PyObject *
4047PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004048 Py_ssize_t size,
4049 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004050{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004051 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4052}
4053
Antoine Pitrou244651a2009-05-04 18:56:13 +00004054/* The decoder. The only state we preserve is our read position,
4055 * i.e. how many characters we have consumed. So if we end in the
4056 * middle of a shift sequence we have to back off the read position
4057 * and the output to the beginning of the sequence, otherwise we lose
4058 * all the shift state (seen bits, number of bits seen, high
4059 * surrogate). */
4060
Alexander Belopolsky40018472011-02-26 01:02:56 +00004061PyObject *
4062PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004063 Py_ssize_t size,
4064 const char *errors,
4065 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004066{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004067 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004068 Py_ssize_t startinpos;
4069 Py_ssize_t endinpos;
4070 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004071 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004072 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004073 const char *errmsg = "";
4074 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004075 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004076 unsigned int base64bits = 0;
4077 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004078 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004079 PyObject *errorHandler = NULL;
4080 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004081
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004082 /* Start off assuming it's all ASCII. Widen later as necessary. */
4083 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004084 if (!unicode)
4085 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004086 if (size == 0) {
4087 if (consumed)
4088 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004089 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004090 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004091
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004092 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004093 e = s + size;
4094
4095 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004096 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004097 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004098 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004099
Antoine Pitrou244651a2009-05-04 18:56:13 +00004100 if (inShift) { /* in a base-64 section */
4101 if (IS_BASE64(ch)) { /* consume a base-64 character */
4102 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4103 base64bits += 6;
4104 s++;
4105 if (base64bits >= 16) {
4106 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004107 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004108 base64bits -= 16;
4109 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4110 if (surrogate) {
4111 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004112 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4113 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004114 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4115 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004116 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004117 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004118 }
4119 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004120 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4121 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004122 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004123 }
4124 }
Victor Stinner551ac952011-11-29 22:58:13 +01004125 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004126 /* first surrogate */
4127 surrogate = outCh;
4128 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004129 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004130 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4131 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004132 }
4133 }
4134 }
4135 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004136 inShift = 0;
4137 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004138 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004139 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4140 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004141 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004142 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004143 if (base64bits > 0) { /* left-over bits */
4144 if (base64bits >= 6) {
4145 /* We've seen at least one base-64 character */
4146 errmsg = "partial character in shift sequence";
4147 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004148 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004149 else {
4150 /* Some bits remain; they should be zero */
4151 if (base64buffer != 0) {
4152 errmsg = "non-zero padding bits in shift sequence";
4153 goto utf7Error;
4154 }
4155 }
4156 }
4157 if (ch != '-') {
4158 /* '-' is absorbed; other terminating
4159 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004160 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4161 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004162 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004163 }
4164 }
4165 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004166 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004167 s++; /* consume '+' */
4168 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004169 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004170 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4171 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004172 }
4173 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004174 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004175 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004176 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004177 }
4178 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004179 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004180 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4181 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004182 s++;
4183 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004184 else {
4185 startinpos = s-starts;
4186 s++;
4187 errmsg = "unexpected special character";
4188 goto utf7Error;
4189 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004190 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004191utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 endinpos = s-starts;
4193 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004194 errors, &errorHandler,
4195 "utf7", errmsg,
4196 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004197 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004198 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004199 }
4200
Antoine Pitrou244651a2009-05-04 18:56:13 +00004201 /* end of string */
4202
4203 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4204 /* if we're in an inconsistent state, that's an error */
4205 if (surrogate ||
4206 (base64bits >= 6) ||
4207 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004208 endinpos = size;
4209 if (unicode_decode_call_errorhandler(
4210 errors, &errorHandler,
4211 "utf7", "unterminated shift sequence",
4212 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004213 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004214 goto onError;
4215 if (s < e)
4216 goto restart;
4217 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004218 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004219
4220 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004221 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004222 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004223 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004224 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004225 }
4226 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004227 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004228 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004229 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004230
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004231 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004232 goto onError;
4233
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004234 Py_XDECREF(errorHandler);
4235 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004236 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004237
Benjamin Peterson29060642009-01-31 22:14:21 +00004238 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004239 Py_XDECREF(errorHandler);
4240 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004241 Py_DECREF(unicode);
4242 return NULL;
4243}
4244
4245
Alexander Belopolsky40018472011-02-26 01:02:56 +00004246PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004247_PyUnicode_EncodeUTF7(PyObject *str,
4248 int base64SetO,
4249 int base64WhiteSpace,
4250 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004251{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004252 int kind;
4253 void *data;
4254 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004255 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004256 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004257 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004258 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004259 unsigned int base64bits = 0;
4260 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004261 char * out;
4262 char * start;
4263
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004264 if (PyUnicode_READY(str) < 0)
4265 return NULL;
4266 kind = PyUnicode_KIND(str);
4267 data = PyUnicode_DATA(str);
4268 len = PyUnicode_GET_LENGTH(str);
4269
4270 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004271 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004272
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004273 /* It might be possible to tighten this worst case */
4274 allocated = 8 * len;
4275 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004276 return PyErr_NoMemory();
4277
Antoine Pitrou244651a2009-05-04 18:56:13 +00004278 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004279 if (v == NULL)
4280 return NULL;
4281
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004282 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004283 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004284 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004285
Antoine Pitrou244651a2009-05-04 18:56:13 +00004286 if (inShift) {
4287 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4288 /* shifting out */
4289 if (base64bits) { /* output remaining bits */
4290 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4291 base64buffer = 0;
4292 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004293 }
4294 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004295 /* Characters not in the BASE64 set implicitly unshift the sequence
4296 so no '-' is required, except if the character is itself a '-' */
4297 if (IS_BASE64(ch) || ch == '-') {
4298 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004299 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004300 *out++ = (char) ch;
4301 }
4302 else {
4303 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004304 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004305 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004306 else { /* not in a shift sequence */
4307 if (ch == '+') {
4308 *out++ = '+';
4309 *out++ = '-';
4310 }
4311 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4312 *out++ = (char) ch;
4313 }
4314 else {
4315 *out++ = '+';
4316 inShift = 1;
4317 goto encode_char;
4318 }
4319 }
4320 continue;
4321encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004322 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004323 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004324
Antoine Pitrou244651a2009-05-04 18:56:13 +00004325 /* code first surrogate */
4326 base64bits += 16;
4327 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4328 while (base64bits >= 6) {
4329 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4330 base64bits -= 6;
4331 }
4332 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004333 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004334 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004335 base64bits += 16;
4336 base64buffer = (base64buffer << 16) | ch;
4337 while (base64bits >= 6) {
4338 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4339 base64bits -= 6;
4340 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004341 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004342 if (base64bits)
4343 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4344 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004345 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004346 if (_PyBytes_Resize(&v, out - start) < 0)
4347 return NULL;
4348 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004349}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004350PyObject *
4351PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4352 Py_ssize_t size,
4353 int base64SetO,
4354 int base64WhiteSpace,
4355 const char *errors)
4356{
4357 PyObject *result;
4358 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4359 if (tmp == NULL)
4360 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004361 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004362 base64WhiteSpace, errors);
4363 Py_DECREF(tmp);
4364 return result;
4365}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004366
Antoine Pitrou244651a2009-05-04 18:56:13 +00004367#undef IS_BASE64
4368#undef FROM_BASE64
4369#undef TO_BASE64
4370#undef DECODE_DIRECT
4371#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004372
Guido van Rossumd57fd912000-03-10 22:53:23 +00004373/* --- UTF-8 Codec -------------------------------------------------------- */
4374
/* Sequence length selected by each possible first byte of a UTF-8
   sequence; see RFC 3629 for details.  A zero entry means the byte is an
   illegal prefix (80-BF, C0-C1 and F5-FF). */
static char utf8_code_length[256] = {
    /* 00-0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 10-1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 20-2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 30-3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 40-4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 50-5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 60-6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 70-7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90-9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0-AF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0-BF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1 illegal, C2-CF */
                0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0-DF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4, F5-FF illegal */
                4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
4396
Alexander Belopolsky40018472011-02-26 01:02:56 +00004397PyObject *
4398PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004399 Py_ssize_t size,
4400 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004401{
Walter Dörwald69652032004-09-07 20:24:22 +00004402 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4403}
4404
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004405#include "stringlib/ucs1lib.h"
4406#include "stringlib/codecs.h"
4407#include "stringlib/undef.h"
4408
4409#include "stringlib/ucs2lib.h"
4410#include "stringlib/codecs.h"
4411#include "stringlib/undef.h"
4412
4413#include "stringlib/ucs4lib.h"
4414#include "stringlib/codecs.h"
4415#include "stringlib/undef.h"
4416
Antoine Pitrouab868312009-01-10 15:40:25 +00004417/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4418#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4419
4420/* Mask to quickly check whether a C 'long' contains a
4421 non-ASCII, UTF8-encoded char. */
4422#if (SIZEOF_LONG == 8)
4423# define ASCII_CHAR_MASK 0x8080808080808080L
4424#elif (SIZEOF_LONG == 4)
4425# define ASCII_CHAR_MASK 0x80808080L
4426#else
4427# error C 'long' size should be either 4 or 8!
4428#endif
4429
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004430/* Scans a UTF-8 string and returns the maximum character to be expected
4431 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004432
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004433 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004434 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004435 */
4436static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004437utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004438{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004439 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004440 const unsigned char *end = p + string_size;
4441 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004442
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004443 assert(unicode_size != NULL);
4444
4445 /* By having a cascade of independent loops which fallback onto each
4446 other, we minimize the amount of work done in the average loop
4447 iteration, and we also maximize the CPU's ability to predict
4448 branches correctly (because a given condition will have always the
4449 same boolean outcome except perhaps in the last iteration of the
4450 corresponding loop).
4451 In the general case this brings us rather close to decoding
4452 performance pre-PEP 393, despite the two-pass decoding.
4453
4454 Note that the pure ASCII loop is not duplicated once a non-ASCII
4455 character has been encountered. It is actually a pessimization (by
4456 a significant factor) to use this loop on text with many non-ASCII
4457 characters, and it is important to avoid bad performance on valid
4458 utf-8 data (invalid utf-8 being a different can of worms).
4459 */
4460
4461 /* ASCII */
4462 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004463 /* Only check value if it's not a ASCII char... */
4464 if (*p < 0x80) {
4465 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4466 an explanation. */
4467 if (!((size_t) p & LONG_PTR_MASK)) {
4468 /* Help register allocation */
4469 register const unsigned char *_p = p;
4470 while (_p < aligned_end) {
4471 unsigned long value = *(unsigned long *) _p;
4472 if (value & ASCII_CHAR_MASK)
4473 break;
4474 _p += SIZEOF_LONG;
4475 char_count += SIZEOF_LONG;
4476 }
4477 p = _p;
4478 if (p == end)
4479 break;
4480 }
4481 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004482 if (*p < 0x80)
4483 ++char_count;
4484 else
4485 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004486 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004487 *unicode_size = char_count;
4488 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004489
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004490_ucs1loop:
4491 for (; p < end; ++p) {
4492 if (*p < 0xc4)
4493 char_count += ((*p & 0xc0) != 0x80);
4494 else
4495 goto _ucs2loop;
4496 }
4497 *unicode_size = char_count;
4498 return 255;
4499
4500_ucs2loop:
4501 for (; p < end; ++p) {
4502 if (*p < 0xf0)
4503 char_count += ((*p & 0xc0) != 0x80);
4504 else
4505 goto _ucs4loop;
4506 }
4507 *unicode_size = char_count;
4508 return 65535;
4509
4510_ucs4loop:
4511 for (; p < end; ++p) {
4512 char_count += ((*p & 0xc0) != 0x80);
4513 }
4514 *unicode_size = char_count;
4515 return 65537;
4516}
4517
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors. Implicit parameters: unicode, kind, data, onError.
   Potential resizing overallocates, so the result needs to shrink at the end.

   `index` is evaluated exactly once (into a local) and `value` exactly
   once; both are parenthesized in the expansion so that arbitrary
   expressions can be passed.  On failure of unicode_resize() or
   unicode_putchar() control jumps to the caller's `onError` label.

   NOTE(review): the resize is only attempted when the position is strictly
   greater than the current length, so a write at position == length goes
   ahead without resizing -- confirm this is intended.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        Py_ssize_t pos = (index);                                       \
        if (pos > PyUnicode_GET_LENGTH(unicode) &&                      \
            unicode_resize(&unicode, pos + pos/8) < 0)                  \
            goto onError;                                               \
        if (unicode_putchar(&unicode, &pos, (value)) < 0)               \
            goto onError;                                               \
    } while (0)
4531
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004532static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004533decode_utf8_errors(const char *starts,
4534 Py_ssize_t size,
4535 const char *errors,
4536 Py_ssize_t *consumed,
4537 const char *s,
4538 PyObject *unicode,
4539 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004540{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004541 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004542 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004543 Py_ssize_t startinpos;
4544 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004545 const char *e = starts + size;
4546 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004547 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004548 PyObject *errorHandler = NULL;
4549 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004550
Antoine Pitrouab868312009-01-10 15:40:25 +00004551 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004552
4553 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004554 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555
4556 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004557 /* Fast path for runs of ASCII characters. Given that common UTF-8
4558 input will consist of an overwhelming majority of ASCII
4559 characters, we try to optimize for this case by checking
4560 as many characters as a C 'long' can contain.
4561 First, check if we can do an aligned read, as most CPUs have
4562 a penalty for unaligned reads.
4563 */
4564 if (!((size_t) s & LONG_PTR_MASK)) {
4565 /* Help register allocation */
4566 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004567 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004568 while (_s < aligned_end) {
4569 /* Read a whole long at a time (either 4 or 8 bytes),
4570 and do a fast unrolled copy if it only contains ASCII
4571 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004572 unsigned long value = *(unsigned long *) _s;
4573 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004574 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004575 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4576 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4577 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4578 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004579#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004580 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4581 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4582 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4583 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004584#endif
4585 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004586 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004587 }
4588 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004589 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004590 if (s == e)
4591 break;
4592 ch = (unsigned char)*s;
4593 }
4594 }
4595
4596 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004597 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004598 s++;
4599 continue;
4600 }
4601
4602 n = utf8_code_length[ch];
4603
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004604 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004605 if (consumed)
4606 break;
4607 else {
4608 errmsg = "unexpected end of data";
4609 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004610 endinpos = startinpos+1;
4611 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4612 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004613 goto utf8Error;
4614 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004615 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616
4617 switch (n) {
4618
4619 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004620 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004621 startinpos = s-starts;
4622 endinpos = startinpos+1;
4623 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004624
4625 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004626 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004627 startinpos = s-starts;
4628 endinpos = startinpos+1;
4629 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630
4631 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004632 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004633 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004634 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004635 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004636 goto utf8Error;
4637 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004638 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004639 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004640 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004641 break;
4642
4643 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004644 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4645 will result in surrogates in range d800-dfff. Surrogates are
4646 not valid UTF-8 so they are rejected.
4647 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4648 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004649 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004650 (s[2] & 0xc0) != 0x80 ||
4651 ((unsigned char)s[0] == 0xE0 &&
4652 (unsigned char)s[1] < 0xA0) ||
4653 ((unsigned char)s[0] == 0xED &&
4654 (unsigned char)s[1] > 0x9F)) {
4655 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004656 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004657 endinpos = startinpos + 1;
4658
4659 /* if s[1] first two bits are 1 and 0, then the invalid
4660 continuation byte is s[2], so increment endinpos by 1,
4661 if not, s[1] is invalid and endinpos doesn't need to
4662 be incremented. */
4663 if ((s[1] & 0xC0) == 0x80)
4664 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004665 goto utf8Error;
4666 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004668 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004669 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004670 break;
4671
4672 case 4:
4673 if ((s[1] & 0xc0) != 0x80 ||
4674 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004675 (s[3] & 0xc0) != 0x80 ||
4676 ((unsigned char)s[0] == 0xF0 &&
4677 (unsigned char)s[1] < 0x90) ||
4678 ((unsigned char)s[0] == 0xF4 &&
4679 (unsigned char)s[1] > 0x8F)) {
4680 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004681 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004682 endinpos = startinpos + 1;
4683 if ((s[1] & 0xC0) == 0x80) {
4684 endinpos++;
4685 if ((s[2] & 0xC0) == 0x80)
4686 endinpos++;
4687 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004688 goto utf8Error;
4689 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004690 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004691 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004692 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004693
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004694 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004695 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696 }
4697 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004698 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004699
Benjamin Peterson29060642009-01-31 22:14:21 +00004700 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004701 if (unicode_decode_call_errorhandler(
4702 errors, &errorHandler,
4703 "utf8", errmsg,
4704 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004705 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004706 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004707 /* Update data because unicode_decode_call_errorhandler might have
4708 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004709 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710 }
Walter Dörwald69652032004-09-07 20:24:22 +00004711 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004712 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004714 /* Adjust length and ready string when it contained errors and
4715 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004716 if (unicode_resize(&unicode, i) < 0)
4717 goto onError;
4718 unicode_adjust_maxchar(&unicode);
4719 if (unicode == NULL)
4720 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004722 Py_XDECREF(errorHandler);
4723 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004724 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004725 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726
Benjamin Peterson29060642009-01-31 22:14:21 +00004727 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004728 Py_XDECREF(errorHandler);
4729 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004730 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731 return NULL;
4732}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004733#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004734
Victor Stinner785938e2011-12-11 20:09:03 +01004735PyObject *
4736PyUnicode_DecodeUTF8Stateful(const char *s,
4737 Py_ssize_t size,
4738 const char *errors,
4739 Py_ssize_t *consumed)
4740{
4741 Py_UCS4 maxchar = 0;
4742 Py_ssize_t unicode_size;
4743 int has_errors = 0;
4744 PyObject *unicode;
4745 int kind;
4746 void *data;
4747 const char *starts = s;
4748 const char *e;
4749 Py_ssize_t i;
4750
4751 if (size == 0) {
4752 if (consumed)
4753 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004754 Py_INCREF(unicode_empty);
4755 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004756 }
4757
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004758 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004759
4760 /* When the string is ASCII only, just use memcpy and return.
4761 unicode_size may be != size if there is an incomplete UTF-8
4762 sequence at the end of the ASCII block. */
4763 if (maxchar < 128 && size == unicode_size) {
4764 if (consumed)
4765 *consumed = size;
4766 return unicode_fromascii(s, size);
4767 }
4768
4769 unicode = PyUnicode_New(unicode_size, maxchar);
4770 if (!unicode)
4771 return NULL;
4772 kind = PyUnicode_KIND(unicode);
4773 data = PyUnicode_DATA(unicode);
4774
4775 /* Unpack UTF-8 encoded data */
4776 i = 0;
4777 e = starts + size;
4778 switch (kind) {
4779 case PyUnicode_1BYTE_KIND:
4780 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4781 break;
4782 case PyUnicode_2BYTE_KIND:
4783 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4784 break;
4785 case PyUnicode_4BYTE_KIND:
4786 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4787 break;
4788 }
4789 if (!has_errors) {
4790 /* Ensure the unicode size calculation was correct */
4791 assert(i == unicode_size);
4792 assert(s == e);
4793 if (consumed)
4794 *consumed = size;
4795 return unicode;
4796 }
4797
4798 /* In case of errors, maxchar and size computation might be incorrect;
4799 code below refits and resizes as necessary. */
4800 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4801}
4802
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004803#ifdef __APPLE__
4804
4805/* Simplified UTF-8 decoder using surrogateescape error handler,
4806 used to decode the command line arguments on Mac OS X. */
4807
4808wchar_t*
4809_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4810{
4811 int n;
4812 const char *e;
4813 wchar_t *unicode, *p;
4814
4815 /* Note: size will always be longer than the resulting Unicode
4816 character count */
4817 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4818 PyErr_NoMemory();
4819 return NULL;
4820 }
4821 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4822 if (!unicode)
4823 return NULL;
4824
4825 /* Unpack UTF-8 encoded data */
4826 p = unicode;
4827 e = s + size;
4828 while (s < e) {
4829 Py_UCS4 ch = (unsigned char)*s;
4830
4831 if (ch < 0x80) {
4832 *p++ = (wchar_t)ch;
4833 s++;
4834 continue;
4835 }
4836
4837 n = utf8_code_length[ch];
4838 if (s + n > e) {
4839 goto surrogateescape;
4840 }
4841
4842 switch (n) {
4843 case 0:
4844 case 1:
4845 goto surrogateescape;
4846
4847 case 2:
4848 if ((s[1] & 0xc0) != 0x80)
4849 goto surrogateescape;
4850 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4851 assert ((ch > 0x007F) && (ch <= 0x07FF));
4852 *p++ = (wchar_t)ch;
4853 break;
4854
4855 case 3:
4856 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4857 will result in surrogates in range d800-dfff. Surrogates are
4858 not valid UTF-8 so they are rejected.
4859 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4860 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4861 if ((s[1] & 0xc0) != 0x80 ||
4862 (s[2] & 0xc0) != 0x80 ||
4863 ((unsigned char)s[0] == 0xE0 &&
4864 (unsigned char)s[1] < 0xA0) ||
4865 ((unsigned char)s[0] == 0xED &&
4866 (unsigned char)s[1] > 0x9F)) {
4867
4868 goto surrogateescape;
4869 }
4870 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4871 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004872 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004873 break;
4874
4875 case 4:
4876 if ((s[1] & 0xc0) != 0x80 ||
4877 (s[2] & 0xc0) != 0x80 ||
4878 (s[3] & 0xc0) != 0x80 ||
4879 ((unsigned char)s[0] == 0xF0 &&
4880 (unsigned char)s[1] < 0x90) ||
4881 ((unsigned char)s[0] == 0xF4 &&
4882 (unsigned char)s[1] > 0x8F)) {
4883 goto surrogateescape;
4884 }
4885 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4886 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004887 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004888
4889#if SIZEOF_WCHAR_T == 4
4890 *p++ = (wchar_t)ch;
4891#else
4892 /* compute and append the two surrogates: */
Victor Stinner551ac952011-11-29 22:58:13 +01004893 *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4894 *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004895#endif
4896 break;
4897 }
4898 s += n;
4899 continue;
4900
4901 surrogateescape:
4902 *p++ = 0xDC00 + ch;
4903 s++;
4904 }
4905 *p = L'\0';
4906 return unicode;
4907}
4908
4909#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004911/* Primary internal function which creates utf8 encoded bytes objects.
4912
4913 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004914 and allocate exactly as much space needed at the end. Else allocate the
4915 maximum possible needed (4 result bytes per Unicode character), and return
4916 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004917*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004918PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004919_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920{
Tim Peters602f7402002-04-27 18:03:26 +00004921#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004922
Guido van Rossum98297ee2007-11-06 21:34:58 +00004923 Py_ssize_t i; /* index into s of next input byte */
4924 PyObject *result; /* result string object */
4925 char *p; /* next free byte in output buffer */
4926 Py_ssize_t nallocated; /* number of result bytes allocated */
4927 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004928 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004929 PyObject *errorHandler = NULL;
4930 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004931 int kind;
4932 void *data;
4933 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004934 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004936 if (!PyUnicode_Check(unicode)) {
4937 PyErr_BadArgument();
4938 return NULL;
4939 }
4940
4941 if (PyUnicode_READY(unicode) == -1)
4942 return NULL;
4943
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004944 if (PyUnicode_UTF8(unicode))
4945 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4946 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004947
4948 kind = PyUnicode_KIND(unicode);
4949 data = PyUnicode_DATA(unicode);
4950 size = PyUnicode_GET_LENGTH(unicode);
4951
Tim Peters602f7402002-04-27 18:03:26 +00004952 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004953
Tim Peters602f7402002-04-27 18:03:26 +00004954 if (size <= MAX_SHORT_UNICHARS) {
4955 /* Write into the stack buffer; nallocated can't overflow.
4956 * At the end, we'll allocate exactly as much heap space as it
4957 * turns out we need.
4958 */
4959 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004960 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004961 p = stackbuf;
4962 }
4963 else {
4964 /* Overallocate on the heap, and give the excess back at the end. */
4965 nallocated = size * 4;
4966 if (nallocated / 4 != size) /* overflow! */
4967 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004968 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004969 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004970 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004971 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004972 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004973
Tim Peters602f7402002-04-27 18:03:26 +00004974 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004975 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004976
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004977 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004978 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004980
Guido van Rossumd57fd912000-03-10 22:53:23 +00004981 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004982 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004983 *p++ = (char)(0xc0 | (ch >> 6));
4984 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner551ac952011-11-29 22:58:13 +01004985 } else if (Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004986 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004987 Py_ssize_t repsize, k, startpos;
4988 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004989 rep = unicode_encode_call_errorhandler(
4990 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004991 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004992 if (!rep)
4993 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004995 if (PyBytes_Check(rep))
4996 repsize = PyBytes_GET_SIZE(rep);
4997 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004998 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004999
5000 if (repsize > 4) {
5001 Py_ssize_t offset;
5002
5003 if (result == NULL)
5004 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00005005 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005006 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00005007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005008 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
5009 /* integer overflow */
5010 PyErr_NoMemory();
5011 goto error;
5012 }
5013 nallocated += repsize - 4;
5014 if (result != NULL) {
5015 if (_PyBytes_Resize(&result, nallocated) < 0)
5016 goto error;
5017 } else {
5018 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00005019 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005020 goto error;
5021 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
5022 }
5023 p = PyBytes_AS_STRING(result) + offset;
5024 }
Victor Stinner31be90b2010-04-22 19:38:16 +00005025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005026 if (PyBytes_Check(rep)) {
5027 char *prep = PyBytes_AS_STRING(rep);
5028 for(k = repsize; k > 0; k--)
5029 *p++ = *prep++;
5030 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01005031 enum PyUnicode_Kind repkind;
5032 void *repdata;
5033
Antoine Pitrou31b92a52011-11-12 18:35:19 +01005034 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01005035 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01005036 repkind = PyUnicode_KIND(rep);
5037 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005038
5039 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01005040 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005041 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01005042 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01005043 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01005044 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005045 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00005046 goto error;
5047 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01005048 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00005049 }
Victor Stinner31be90b2010-04-22 19:38:16 +00005050 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01005051 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00005052 } else if (ch < 0x10000) {
5053 *p++ = (char)(0xe0 | (ch >> 12));
5054 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5055 *p++ = (char)(0x80 | (ch & 0x3f));
5056 } else /* ch >= 0x10000 */ {
Victor Stinner8faf8212011-12-08 22:14:11 +01005057 assert(ch <= MAX_UNICODE);
Tim Peters602f7402002-04-27 18:03:26 +00005058 /* Encode UCS4 Unicode ordinals */
5059 *p++ = (char)(0xf0 | (ch >> 18));
5060 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5061 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5062 *p++ = (char)(0x80 | (ch & 0x3f));
5063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064 }
Tim Peters0eca65c2002-04-21 17:28:06 +00005065
Guido van Rossum98297ee2007-11-06 21:34:58 +00005066 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00005067 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005068 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00005069 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00005070 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00005071 }
5072 else {
Christian Heimesf3863112007-11-22 07:46:41 +00005073 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00005074 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00005075 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00005076 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00005077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005078
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005079 Py_XDECREF(errorHandler);
5080 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005081 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005082 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01005083 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005084 Py_XDECREF(errorHandler);
5085 Py_XDECREF(exc);
5086 Py_XDECREF(result);
5087 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005088
Tim Peters602f7402002-04-27 18:03:26 +00005089#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090}
5091
Alexander Belopolsky40018472011-02-26 01:02:56 +00005092PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005093PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5094 Py_ssize_t size,
5095 const char *errors)
5096{
5097 PyObject *v, *unicode;
5098
5099 unicode = PyUnicode_FromUnicode(s, size);
5100 if (unicode == NULL)
5101 return NULL;
5102 v = _PyUnicode_AsUTF8String(unicode, errors);
5103 Py_DECREF(unicode);
5104 return v;
5105}
5106
5107PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005108PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005110 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111}
5112
Walter Dörwald41980ca2007-08-16 21:55:45 +00005113/* --- UTF-32 Codec ------------------------------------------------------- */
5114
5115PyObject *
5116PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005117 Py_ssize_t size,
5118 const char *errors,
5119 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005120{
5121 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5122}
5123
5124PyObject *
5125PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005126 Py_ssize_t size,
5127 const char *errors,
5128 int *byteorder,
5129 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005130{
5131 const char *starts = s;
5132 Py_ssize_t startinpos;
5133 Py_ssize_t endinpos;
5134 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005135 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005136 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005137 int bo = 0; /* assume native ordering by default */
5138 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005139 /* Offsets from q for retrieving bytes in the right order. */
5140#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5141 int iorder[] = {0, 1, 2, 3};
5142#else
5143 int iorder[] = {3, 2, 1, 0};
5144#endif
5145 PyObject *errorHandler = NULL;
5146 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005147
Walter Dörwald41980ca2007-08-16 21:55:45 +00005148 q = (unsigned char *)s;
5149 e = q + size;
5150
5151 if (byteorder)
5152 bo = *byteorder;
5153
5154 /* Check for BOM marks (U+FEFF) in the input and adjust current
5155 byte order setting accordingly. In native mode, the leading BOM
5156 mark is skipped, in all other modes, it is copied to the output
5157 stream as-is (giving a ZWNBSP character). */
5158 if (bo == 0) {
5159 if (size >= 4) {
5160 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005161 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005162#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005163 if (bom == 0x0000FEFF) {
5164 q += 4;
5165 bo = -1;
5166 }
5167 else if (bom == 0xFFFE0000) {
5168 q += 4;
5169 bo = 1;
5170 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005171#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005172 if (bom == 0x0000FEFF) {
5173 q += 4;
5174 bo = 1;
5175 }
5176 else if (bom == 0xFFFE0000) {
5177 q += 4;
5178 bo = -1;
5179 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005180#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005181 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005182 }
5183
5184 if (bo == -1) {
5185 /* force LE */
5186 iorder[0] = 0;
5187 iorder[1] = 1;
5188 iorder[2] = 2;
5189 iorder[3] = 3;
5190 }
5191 else if (bo == 1) {
5192 /* force BE */
5193 iorder[0] = 3;
5194 iorder[1] = 2;
5195 iorder[2] = 1;
5196 iorder[3] = 0;
5197 }
5198
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005199 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005200 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005201 if (!unicode)
5202 return NULL;
5203 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005204 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005205 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005206
Walter Dörwald41980ca2007-08-16 21:55:45 +00005207 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005208 Py_UCS4 ch;
5209 /* remaining bytes at the end? (size should be divisible by 4) */
5210 if (e-q<4) {
5211 if (consumed)
5212 break;
5213 errmsg = "truncated data";
5214 startinpos = ((const char *)q)-starts;
5215 endinpos = ((const char *)e)-starts;
5216 goto utf32Error;
5217 /* The remaining input chars are ignored if the callback
5218 chooses to skip the input */
5219 }
5220 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5221 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005222
Benjamin Peterson29060642009-01-31 22:14:21 +00005223 if (ch >= 0x110000)
5224 {
5225 errmsg = "codepoint not in range(0x110000)";
5226 startinpos = ((const char *)q)-starts;
5227 endinpos = startinpos+4;
5228 goto utf32Error;
5229 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005230 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5231 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 q += 4;
5233 continue;
5234 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005235 if (unicode_decode_call_errorhandler(
5236 errors, &errorHandler,
5237 "utf32", errmsg,
5238 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005239 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005240 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005241 }
5242
5243 if (byteorder)
5244 *byteorder = bo;
5245
5246 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005247 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005248
5249 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005250 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005251 goto onError;
5252
5253 Py_XDECREF(errorHandler);
5254 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005255 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005256
Benjamin Peterson29060642009-01-31 22:14:21 +00005257 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005258 Py_DECREF(unicode);
5259 Py_XDECREF(errorHandler);
5260 Py_XDECREF(exc);
5261 return NULL;
5262}
5263
5264PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005265_PyUnicode_EncodeUTF32(PyObject *str,
5266 const char *errors,
5267 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005268{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005269 int kind;
5270 void *data;
5271 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005272 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005273 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005274 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005275 /* Offsets from p for storing byte pairs in the right order. */
5276#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5277 int iorder[] = {0, 1, 2, 3};
5278#else
5279 int iorder[] = {3, 2, 1, 0};
5280#endif
5281
Benjamin Peterson29060642009-01-31 22:14:21 +00005282#define STORECHAR(CH) \
5283 do { \
5284 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5285 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5286 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5287 p[iorder[0]] = (CH) & 0xff; \
5288 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005289 } while(0)
5290
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005291 if (!PyUnicode_Check(str)) {
5292 PyErr_BadArgument();
5293 return NULL;
5294 }
5295 if (PyUnicode_READY(str) < 0)
5296 return NULL;
5297 kind = PyUnicode_KIND(str);
5298 data = PyUnicode_DATA(str);
5299 len = PyUnicode_GET_LENGTH(str);
5300
5301 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005302 bytesize = nsize * 4;
5303 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005304 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005305 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005306 if (v == NULL)
5307 return NULL;
5308
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005309 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005310 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005311 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005312 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005313 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005314
5315 if (byteorder == -1) {
5316 /* force LE */
5317 iorder[0] = 0;
5318 iorder[1] = 1;
5319 iorder[2] = 2;
5320 iorder[3] = 3;
5321 }
5322 else if (byteorder == 1) {
5323 /* force BE */
5324 iorder[0] = 3;
5325 iorder[1] = 2;
5326 iorder[2] = 1;
5327 iorder[3] = 0;
5328 }
5329
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005330 for (i = 0; i < len; i++)
5331 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005332
5333 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005334 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005335#undef STORECHAR
5336}
5337
Alexander Belopolsky40018472011-02-26 01:02:56 +00005338PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005339PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5340 Py_ssize_t size,
5341 const char *errors,
5342 int byteorder)
5343{
5344 PyObject *result;
5345 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5346 if (tmp == NULL)
5347 return NULL;
5348 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5349 Py_DECREF(tmp);
5350 return result;
5351}
5352
5353PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005354PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005355{
Victor Stinnerb960b342011-11-20 19:12:52 +01005356 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005357}
5358
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359/* --- UTF-16 Codec ------------------------------------------------------- */
5360
Tim Peters772747b2001-08-09 22:21:55 +00005361PyObject *
5362PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005363 Py_ssize_t size,
5364 const char *errors,
5365 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366{
Walter Dörwald69652032004-09-07 20:24:22 +00005367 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5368}
5369
Antoine Pitrouab868312009-01-10 15:40:25 +00005370/* Two masks for fast checking of whether a C 'long' may contain
5371 UTF16-encoded surrogate characters. This is an efficient heuristic,
5372 assuming that non-surrogate characters with a code point >= 0x8000 are
5373 rare in most input.
5374 FAST_CHAR_MASK is used when the input is in native byte ordering,
5375 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005376*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005377#if (SIZEOF_LONG == 8)
5378# define FAST_CHAR_MASK 0x8000800080008000L
5379# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5380#elif (SIZEOF_LONG == 4)
5381# define FAST_CHAR_MASK 0x80008000L
5382# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5383#else
5384# error C 'long' size should be either 4 or 8!
5385#endif
5386
Walter Dörwald69652032004-09-07 20:24:22 +00005387PyObject *
5388PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005389 Py_ssize_t size,
5390 const char *errors,
5391 int *byteorder,
5392 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005393{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005394 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005395 Py_ssize_t startinpos;
5396 Py_ssize_t endinpos;
5397 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005398 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005399 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005400 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005401 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005402 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005403 /* Offsets from q for retrieving byte pairs in the right order. */
5404#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5405 int ihi = 1, ilo = 0;
5406#else
5407 int ihi = 0, ilo = 1;
5408#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005409 PyObject *errorHandler = NULL;
5410 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411
5412 /* Note: size will always be longer than the resulting Unicode
5413 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005414 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415 if (!unicode)
5416 return NULL;
5417 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005418 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005419 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420
Tim Peters772747b2001-08-09 22:21:55 +00005421 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005422 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423
5424 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005425 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005427 /* Check for BOM marks (U+FEFF) in the input and adjust current
5428 byte order setting accordingly. In native mode, the leading BOM
5429 mark is skipped, in all other modes, it is copied to the output
5430 stream as-is (giving a ZWNBSP character). */
5431 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005432 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005433 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005434#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005435 if (bom == 0xFEFF) {
5436 q += 2;
5437 bo = -1;
5438 }
5439 else if (bom == 0xFFFE) {
5440 q += 2;
5441 bo = 1;
5442 }
Tim Petersced69f82003-09-16 20:30:58 +00005443#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005444 if (bom == 0xFEFF) {
5445 q += 2;
5446 bo = 1;
5447 }
5448 else if (bom == 0xFFFE) {
5449 q += 2;
5450 bo = -1;
5451 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005452#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005453 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455
Tim Peters772747b2001-08-09 22:21:55 +00005456 if (bo == -1) {
5457 /* force LE */
5458 ihi = 1;
5459 ilo = 0;
5460 }
5461 else if (bo == 1) {
5462 /* force BE */
5463 ihi = 0;
5464 ilo = 1;
5465 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005466#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5467 native_ordering = ilo < ihi;
5468#else
5469 native_ordering = ilo > ihi;
5470#endif
Tim Peters772747b2001-08-09 22:21:55 +00005471
Antoine Pitrouab868312009-01-10 15:40:25 +00005472 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005473 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005474 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005475 /* First check for possible aligned read of a C 'long'. Unaligned
5476 reads are more expensive, better to defer to another iteration. */
5477 if (!((size_t) q & LONG_PTR_MASK)) {
5478 /* Fast path for runs of non-surrogate chars. */
5479 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005480 int kind = PyUnicode_KIND(unicode);
5481 void *data = PyUnicode_DATA(unicode);
5482 while (_q < aligned_end) {
5483 unsigned long block = * (unsigned long *) _q;
5484 unsigned short *pblock = (unsigned short*)&block;
5485 Py_UCS4 maxch;
5486 if (native_ordering) {
5487 /* Can use buffer directly */
5488 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005489 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005490 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005491 else {
5492 /* Need to byte-swap */
5493 unsigned char *_p = (unsigned char*)pblock;
5494 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005495 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005496 _p[0] = _q[1];
5497 _p[1] = _q[0];
5498 _p[2] = _q[3];
5499 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005500#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005501 _p[4] = _q[5];
5502 _p[5] = _q[4];
5503 _p[6] = _q[7];
5504 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005505#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005506 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005507 maxch = Py_MAX(pblock[0], pblock[1]);
5508#if SIZEOF_LONG == 8
5509 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5510#endif
5511 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5512 if (unicode_widen(&unicode, maxch) < 0)
5513 goto onError;
5514 kind = PyUnicode_KIND(unicode);
5515 data = PyUnicode_DATA(unicode);
5516 }
5517 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5518 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5519#if SIZEOF_LONG == 8
5520 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5521 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5522#endif
5523 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005524 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005525 q = _q;
5526 if (q >= e)
5527 break;
5528 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005529 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005530
Benjamin Peterson14339b62009-01-31 16:36:08 +00005531 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005532
Victor Stinner551ac952011-11-29 22:58:13 +01005533 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005534 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5535 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005536 continue;
5537 }
5538
5539 /* UTF-16 code pair: */
5540 if (q > e) {
5541 errmsg = "unexpected end of data";
5542 startinpos = (((const char *)q) - 2) - starts;
5543 endinpos = ((const char *)e) + 1 - starts;
5544 goto utf16Error;
5545 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005546 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5547 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005548 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005549 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005550 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005551 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005552 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005553 continue;
5554 }
5555 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005556 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005557 startinpos = (((const char *)q)-4)-starts;
5558 endinpos = startinpos+2;
5559 goto utf16Error;
5560 }
5561
Benjamin Peterson14339b62009-01-31 16:36:08 +00005562 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005563 errmsg = "illegal encoding";
5564 startinpos = (((const char *)q)-2)-starts;
5565 endinpos = startinpos+2;
5566 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005567
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005569 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005570 errors,
5571 &errorHandler,
5572 "utf16", errmsg,
5573 &starts,
5574 (const char **)&e,
5575 &startinpos,
5576 &endinpos,
5577 &exc,
5578 (const char **)&q,
5579 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005580 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005583 /* remaining byte at the end? (size should be even) */
5584 if (e == q) {
5585 if (!consumed) {
5586 errmsg = "truncated data";
5587 startinpos = ((const char *)q) - starts;
5588 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005589 if (unicode_decode_call_errorhandler(
5590 errors,
5591 &errorHandler,
5592 "utf16", errmsg,
5593 &starts,
5594 (const char **)&e,
5595 &startinpos,
5596 &endinpos,
5597 &exc,
5598 (const char **)&q,
5599 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005600 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005601 goto onError;
5602 /* The remaining input chars are ignored if the callback
5603 chooses to skip the input */
5604 }
5605 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606
5607 if (byteorder)
5608 *byteorder = bo;
5609
Walter Dörwald69652032004-09-07 20:24:22 +00005610 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005611 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005612
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005614 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 goto onError;
5616
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005617 Py_XDECREF(errorHandler);
5618 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005619 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620
Benjamin Peterson29060642009-01-31 22:14:21 +00005621 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005623 Py_XDECREF(errorHandler);
5624 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 return NULL;
5626}
5627
Antoine Pitrouab868312009-01-10 15:40:25 +00005628#undef FAST_CHAR_MASK
5629#undef SWAPPED_FAST_CHAR_MASK
5630
Tim Peters772747b2001-08-09 22:21:55 +00005631PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005632_PyUnicode_EncodeUTF16(PyObject *str,
5633 const char *errors,
5634 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005636 int kind;
5637 void *data;
5638 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005639 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005640 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005641 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005642 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005643 /* Offsets from p for storing byte pairs in the right order. */
5644#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5645 int ihi = 1, ilo = 0;
5646#else
5647 int ihi = 0, ilo = 1;
5648#endif
5649
Benjamin Peterson29060642009-01-31 22:14:21 +00005650#define STORECHAR(CH) \
5651 do { \
5652 p[ihi] = ((CH) >> 8) & 0xff; \
5653 p[ilo] = (CH) & 0xff; \
5654 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005655 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005657 if (!PyUnicode_Check(str)) {
5658 PyErr_BadArgument();
5659 return NULL;
5660 }
5661 if (PyUnicode_READY(str) < 0)
5662 return NULL;
5663 kind = PyUnicode_KIND(str);
5664 data = PyUnicode_DATA(str);
5665 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005666
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005667 pairs = 0;
5668 if (kind == PyUnicode_4BYTE_KIND)
5669 for (i = 0; i < len; i++)
5670 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5671 pairs++;
5672 /* 2 * (len + pairs + (byteorder == 0)) */
5673 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005675 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005676 bytesize = nsize * 2;
5677 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005678 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005679 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 if (v == NULL)
5681 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005683 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005685 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005686 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005687 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005688
5689 if (byteorder == -1) {
5690 /* force LE */
5691 ihi = 1;
5692 ilo = 0;
5693 }
5694 else if (byteorder == 1) {
5695 /* force BE */
5696 ihi = 0;
5697 ilo = 1;
5698 }
5699
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005700 for (i = 0; i < len; i++) {
5701 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5702 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005704 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5705 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 }
Tim Peters772747b2001-08-09 22:21:55 +00005707 STORECHAR(ch);
5708 if (ch2)
5709 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005710 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005711
5712 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005713 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005714#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715}
5716
Alexander Belopolsky40018472011-02-26 01:02:56 +00005717PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005718PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5719 Py_ssize_t size,
5720 const char *errors,
5721 int byteorder)
5722{
5723 PyObject *result;
5724 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5725 if (tmp == NULL)
5726 return NULL;
5727 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5728 Py_DECREF(tmp);
5729 return result;
5730}
5731
5732PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005733PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005735 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736}
5737
5738/* --- Unicode Escape Codec ----------------------------------------------- */
5739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005740/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5741 if all the escapes in the string make it still a valid ASCII string.
5742 Returns -1 if any escapes were found which cause the string to
5743 pop out of ASCII range. Otherwise returns the length of the
5744 required buffer to hold the string.
5745 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005746static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005747length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5748{
5749 const unsigned char *p = (const unsigned char *)s;
5750 const unsigned char *end = p + size;
5751 Py_ssize_t length = 0;
5752
5753 if (size < 0)
5754 return -1;
5755
5756 for (; p < end; ++p) {
5757 if (*p > 127) {
5758 /* Non-ASCII */
5759 return -1;
5760 }
5761 else if (*p != '\\') {
5762 /* Normal character */
5763 ++length;
5764 }
5765 else {
5766 /* Backslash-escape, check next char */
5767 ++p;
5768 /* Escape sequence reaches till end of string or
5769 non-ASCII follow-up. */
5770 if (p >= end || *p > 127)
5771 return -1;
5772 switch (*p) {
5773 case '\n':
5774 /* backslash + \n result in zero characters */
5775 break;
5776 case '\\': case '\'': case '\"':
5777 case 'b': case 'f': case 't':
5778 case 'n': case 'r': case 'v': case 'a':
5779 ++length;
5780 break;
5781 case '0': case '1': case '2': case '3':
5782 case '4': case '5': case '6': case '7':
5783 case 'x': case 'u': case 'U': case 'N':
5784 /* these do not guarantee ASCII characters */
5785 return -1;
5786 default:
5787 /* count the backslash + the other character */
5788 length += 2;
5789 }
5790 }
5791 }
5792 return length;
5793}
5794
Fredrik Lundh06d12682001-01-24 07:59:11 +00005795static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005796
Alexander Belopolsky40018472011-02-26 01:02:56 +00005797PyObject *
5798PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005799 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005800 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005802 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005803 Py_ssize_t startinpos;
5804 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005805 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005806 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005808 char* message;
5809 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005810 PyObject *errorHandler = NULL;
5811 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005812 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005813 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005814
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005815 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005816
5817 /* After length_of_escaped_ascii_string() there are two alternatives,
5818 either the string is pure ASCII with named escapes like \n, etc.
5819 and we determined it's exact size (common case)
5820 or it contains \x, \u, ... escape sequences. then we create a
5821 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005822 if (len >= 0) {
5823 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005824 if (!v)
5825 goto onError;
5826 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005827 }
5828 else {
5829 /* Escaped strings will always be longer than the resulting
5830 Unicode string, so we start with size here and then reduce the
5831 length after conversion to the true value.
5832 (but if the error callback returns a long replacement string
5833 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005834 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005835 if (!v)
5836 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005837 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005838 }
5839
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005841 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005842 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005844
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845 while (s < end) {
5846 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005847 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005848 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005850 /* The only case in which i == ascii_length is a backslash
5851 followed by a newline. */
5852 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005853
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854 /* Non-escape characters are interpreted as Unicode ordinals */
5855 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005856 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5857 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858 continue;
5859 }
5860
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005861 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862 /* \ - Escapes */
5863 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005864 c = *s++;
5865 if (s > end)
5866 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005867
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005868 /* The only case in which i == ascii_length is a backslash
5869 followed by a newline. */
5870 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005871
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005872 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873
Benjamin Peterson29060642009-01-31 22:14:21 +00005874 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005875#define WRITECHAR(ch) \
5876 do { \
5877 if (unicode_putchar(&v, &i, ch) < 0) \
5878 goto onError; \
5879 }while(0)
5880
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005882 case '\\': WRITECHAR('\\'); break;
5883 case '\'': WRITECHAR('\''); break;
5884 case '\"': WRITECHAR('\"'); break;
5885 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005886 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005887 case 'f': WRITECHAR('\014'); break;
5888 case 't': WRITECHAR('\t'); break;
5889 case 'n': WRITECHAR('\n'); break;
5890 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005891 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005892 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005893 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005894 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895
Benjamin Peterson29060642009-01-31 22:14:21 +00005896 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 case '0': case '1': case '2': case '3':
5898 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005899 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005900 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005901 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005902 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005903 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005905 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 break;
5907
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 /* hex escapes */
5909 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005911 digits = 2;
5912 message = "truncated \\xXX escape";
5913 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005917 digits = 4;
5918 message = "truncated \\uXXXX escape";
5919 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920
Benjamin Peterson29060642009-01-31 22:14:21 +00005921 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005922 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005923 digits = 8;
5924 message = "truncated \\UXXXXXXXX escape";
5925 hexescape:
5926 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005927 if (s+digits>end) {
5928 endinpos = size;
5929 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005930 errors, &errorHandler,
5931 "unicodeescape", "end of string in escape sequence",
5932 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005933 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005934 goto onError;
5935 goto nextByte;
5936 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005937 for (j = 0; j < digits; ++j) {
5938 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005939 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005940 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005941 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005942 errors, &errorHandler,
5943 "unicodeescape", message,
5944 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005945 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005946 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005947 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005948 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005949 }
5950 chr = (chr<<4) & ~0xF;
5951 if (c >= '0' && c <= '9')
5952 chr += c - '0';
5953 else if (c >= 'a' && c <= 'f')
5954 chr += 10 + c - 'a';
5955 else
5956 chr += 10 + c - 'A';
5957 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005958 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005959 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005960 /* _decoding_error will have already written into the
5961 target buffer. */
5962 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005963 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005964 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005965 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005966 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005967 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005968 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005969 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005970 errors, &errorHandler,
5971 "unicodeescape", "illegal Unicode character",
5972 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005973 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005974 goto onError;
5975 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005976 break;
5977
Benjamin Peterson29060642009-01-31 22:14:21 +00005978 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005979 case 'N':
5980 message = "malformed \\N character escape";
5981 if (ucnhash_CAPI == NULL) {
5982 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005983 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5984 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005985 if (ucnhash_CAPI == NULL)
5986 goto ucnhashError;
5987 }
5988 if (*s == '{') {
5989 const char *start = s+1;
5990 /* look for the closing brace */
5991 while (*s != '}' && s < end)
5992 s++;
5993 if (s > start && s < end && *s == '}') {
5994 /* found a name. look it up in the unicode database */
5995 message = "unknown Unicode character name";
5996 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005997 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005998 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005999 goto store;
6000 }
6001 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006002 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006003 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 errors, &errorHandler,
6005 "unicodeescape", message,
6006 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006007 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006008 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006009 break;
6010
6011 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006012 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006013 message = "\\ at end of string";
6014 s--;
6015 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006016 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 errors, &errorHandler,
6018 "unicodeescape", message,
6019 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006020 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00006021 goto onError;
6022 }
6023 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006024 WRITECHAR('\\');
6025 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006026 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006027 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006029 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006030 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006032#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006033
Victor Stinner16e6a802011-12-12 13:24:15 +01006034 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006035 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006036 Py_XDECREF(errorHandler);
6037 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006038 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00006039
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006041 PyErr_SetString(
6042 PyExc_UnicodeError,
6043 "\\N escapes not supported (can't load unicodedata module)"
6044 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006045 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006046 Py_XDECREF(errorHandler);
6047 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006048 return NULL;
6049
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006052 Py_XDECREF(errorHandler);
6053 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 return NULL;
6055}
6056
6057/* Return a Unicode-Escape string version of the Unicode object.
6058
6059 If quotes is true, the string is enclosed in u"" or u'' quotes as
6060 appropriate.
6061
6062*/
6063
Alexander Belopolsky40018472011-02-26 01:02:56 +00006064PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006065PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006067 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006068 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006070 int kind;
6071 void *data;
6072 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073
Thomas Wouters89f507f2006-12-13 04:49:30 +00006074 /* Initial allocation is based on the longest-possible unichr
6075 escape.
6076
6077 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6078 unichr, so in this case it's the longest unichr escape. In
6079 narrow (UTF-16) builds this is five chars per source unichr
6080 since there are two unichrs in the surrogate pair, so in narrow
6081 (UTF-16) builds it's not the longest unichr escape.
6082
6083 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6084 so in the narrow (UTF-16) build case it's the longest unichr
6085 escape.
6086 */
6087
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006088 if (!PyUnicode_Check(unicode)) {
6089 PyErr_BadArgument();
6090 return NULL;
6091 }
6092 if (PyUnicode_READY(unicode) < 0)
6093 return NULL;
6094 len = PyUnicode_GET_LENGTH(unicode);
6095 kind = PyUnicode_KIND(unicode);
6096 data = PyUnicode_DATA(unicode);
6097 switch(kind) {
6098 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6099 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6100 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6101 }
6102
6103 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006104 return PyBytes_FromStringAndSize(NULL, 0);
6105
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006106 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006108
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006109 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006111 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006112 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 if (repr == NULL)
6114 return NULL;
6115
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006116 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006118 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006119 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006120
Walter Dörwald79e913e2007-05-12 11:08:06 +00006121 /* Escape backslashes */
6122 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 *p++ = '\\';
6124 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006125 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006126 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006127
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006128 /* Map 21-bit characters to '\U00xxxxxx' */
6129 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006130 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006131 *p++ = '\\';
6132 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006133 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6134 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6135 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6136 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6137 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6138 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6139 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6140 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006141 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006142 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006143
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006145 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 *p++ = '\\';
6147 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006148 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6149 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6150 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6151 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006153
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006154 /* Map special whitespace to '\t', \n', '\r' */
6155 else if (ch == '\t') {
6156 *p++ = '\\';
6157 *p++ = 't';
6158 }
6159 else if (ch == '\n') {
6160 *p++ = '\\';
6161 *p++ = 'n';
6162 }
6163 else if (ch == '\r') {
6164 *p++ = '\\';
6165 *p++ = 'r';
6166 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006167
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006168 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006169 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006171 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006172 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6173 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006174 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006175
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 /* Copy everything else as-is */
6177 else
6178 *p++ = (char) ch;
6179 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006181 assert(p - PyBytes_AS_STRING(repr) > 0);
6182 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6183 return NULL;
6184 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185}
6186
Alexander Belopolsky40018472011-02-26 01:02:56 +00006187PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006188PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6189 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006191 PyObject *result;
6192 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6193 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006195 result = PyUnicode_AsUnicodeEscapeString(tmp);
6196 Py_DECREF(tmp);
6197 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198}
6199
6200/* --- Raw Unicode Escape Codec ------------------------------------------- */
6201
Alexander Belopolsky40018472011-02-26 01:02:56 +00006202PyObject *
6203PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006204 Py_ssize_t size,
6205 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006207 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006208 Py_ssize_t startinpos;
6209 Py_ssize_t endinpos;
6210 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006211 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212 const char *end;
6213 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006214 PyObject *errorHandler = NULL;
6215 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006216
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217 /* Escaped strings will always be longer than the resulting
6218 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006219 length after conversion to the true value. (But decoding error
6220 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006221 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006225 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006226 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227 end = s + size;
6228 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 unsigned char c;
6230 Py_UCS4 x;
6231 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006232 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233
Benjamin Peterson29060642009-01-31 22:14:21 +00006234 /* Non-escape characters are interpreted as Unicode ordinals */
6235 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006236 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6237 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006239 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 startinpos = s-starts;
6241
6242 /* \u-escapes are only interpreted iff the number of leading
6243 backslashes if odd */
6244 bs = s;
6245 for (;s < end;) {
6246 if (*s != '\\')
6247 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006248 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6249 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 }
6251 if (((s - bs) & 1) == 0 ||
6252 s >= end ||
6253 (*s != 'u' && *s != 'U')) {
6254 continue;
6255 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006256 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 count = *s=='u' ? 4 : 8;
6258 s++;
6259
6260 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006261 for (x = 0, i = 0; i < count; ++i, ++s) {
6262 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006263 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006264 endinpos = s-starts;
6265 if (unicode_decode_call_errorhandler(
6266 errors, &errorHandler,
6267 "rawunicodeescape", "truncated \\uXXXX",
6268 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006269 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006270 goto onError;
6271 goto nextByte;
6272 }
6273 x = (x<<4) & ~0xF;
6274 if (c >= '0' && c <= '9')
6275 x += c - '0';
6276 else if (c >= 'a' && c <= 'f')
6277 x += 10 + c - 'a';
6278 else
6279 x += 10 + c - 'A';
6280 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006281 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006282 if (unicode_putchar(&v, &outpos, x) < 0)
6283 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006284 } else {
6285 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006286 if (unicode_decode_call_errorhandler(
6287 errors, &errorHandler,
6288 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006289 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006290 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006292 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006293 nextByte:
6294 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006296 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006298 Py_XDECREF(errorHandler);
6299 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006300 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006301
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006304 Py_XDECREF(errorHandler);
6305 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 return NULL;
6307}
6308
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006309
Alexander Belopolsky40018472011-02-26 01:02:56 +00006310PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006311PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006313 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314 char *p;
6315 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006316 Py_ssize_t expandsize, pos;
6317 int kind;
6318 void *data;
6319 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006321 if (!PyUnicode_Check(unicode)) {
6322 PyErr_BadArgument();
6323 return NULL;
6324 }
6325 if (PyUnicode_READY(unicode) < 0)
6326 return NULL;
6327 kind = PyUnicode_KIND(unicode);
6328 data = PyUnicode_DATA(unicode);
6329 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006330 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6331 bytes, and 1 byte characters 4. */
6332 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006333
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006334 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006336
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006337 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338 if (repr == NULL)
6339 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006340 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006341 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006343 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006344 for (pos = 0; pos < len; pos++) {
6345 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 /* Map 32-bit characters to '\Uxxxxxxxx' */
6347 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006348 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006349 *p++ = '\\';
6350 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006351 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6352 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6353 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6354 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6355 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6356 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6357 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6358 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006359 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006361 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 *p++ = '\\';
6363 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006364 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6365 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6366 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6367 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006369 /* Copy everything else as-is */
6370 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371 *p++ = (char) ch;
6372 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006373
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006374 assert(p > q);
6375 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006376 return NULL;
6377 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378}
6379
Alexander Belopolsky40018472011-02-26 01:02:56 +00006380PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006381PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6382 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006384 PyObject *result;
6385 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6386 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006387 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006388 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6389 Py_DECREF(tmp);
6390 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391}
6392
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006393/* --- Unicode Internal Codec ------------------------------------------- */
6394
Alexander Belopolsky40018472011-02-26 01:02:56 +00006395PyObject *
6396_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006397 Py_ssize_t size,
6398 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006399{
6400 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006401 Py_ssize_t startinpos;
6402 Py_ssize_t endinpos;
6403 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006404 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006405 const char *end;
6406 const char *reason;
6407 PyObject *errorHandler = NULL;
6408 PyObject *exc = NULL;
6409
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006410 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006411 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006412 1))
6413 return NULL;
6414
Thomas Wouters89f507f2006-12-13 04:49:30 +00006415 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006416 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006417 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006419 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006420 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006421 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006422 end = s + size;
6423
6424 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006425 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006426 Py_UCS4 ch;
6427 /* We copy the raw representation one byte at a time because the
6428 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006429 ((char *) &uch)[0] = s[0];
6430 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006431#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006432 ((char *) &uch)[2] = s[2];
6433 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006434#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006435 ch = uch;
6436
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006437 /* We have to sanity check the raw data, otherwise doom looms for
6438 some malformed UCS-4 data. */
6439 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006440#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006441 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006442#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006443 end-s < Py_UNICODE_SIZE
6444 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006446 startinpos = s - starts;
6447 if (end-s < Py_UNICODE_SIZE) {
6448 endinpos = end-starts;
6449 reason = "truncated input";
6450 }
6451 else {
6452 endinpos = s - starts + Py_UNICODE_SIZE;
6453 reason = "illegal code point (> 0x10FFFF)";
6454 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006455 if (unicode_decode_call_errorhandler(
6456 errors, &errorHandler,
6457 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006458 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006459 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006460 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006461 continue;
6462 }
6463
6464 s += Py_UNICODE_SIZE;
6465#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006466 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006467 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006468 Py_UNICODE uch2;
6469 ((char *) &uch2)[0] = s[0];
6470 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006471 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006472 {
Victor Stinner551ac952011-11-29 22:58:13 +01006473 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006474 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006475 }
6476 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006477#endif
6478
6479 if (unicode_putchar(&v, &outpos, ch) < 0)
6480 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006481 }
6482
Victor Stinner16e6a802011-12-12 13:24:15 +01006483 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006484 goto onError;
6485 Py_XDECREF(errorHandler);
6486 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006487 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006488
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006490 Py_XDECREF(v);
6491 Py_XDECREF(errorHandler);
6492 Py_XDECREF(exc);
6493 return NULL;
6494}
6495
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496/* --- Latin-1 Codec ------------------------------------------------------ */
6497
Alexander Belopolsky40018472011-02-26 01:02:56 +00006498PyObject *
6499PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006500 Py_ssize_t size,
6501 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006504 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505}
6506
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006507/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006508static void
6509make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006510 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006511 PyObject *unicode,
6512 Py_ssize_t startpos, Py_ssize_t endpos,
6513 const char *reason)
6514{
6515 if (*exceptionObject == NULL) {
6516 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006517 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006518 encoding, unicode, startpos, endpos, reason);
6519 }
6520 else {
6521 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6522 goto onError;
6523 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6524 goto onError;
6525 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6526 goto onError;
6527 return;
6528 onError:
6529 Py_DECREF(*exceptionObject);
6530 *exceptionObject = NULL;
6531 }
6532}
6533
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006534/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006535static void
6536raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006537 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006538 PyObject *unicode,
6539 Py_ssize_t startpos, Py_ssize_t endpos,
6540 const char *reason)
6541{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006542 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006543 encoding, unicode, startpos, endpos, reason);
6544 if (*exceptionObject != NULL)
6545 PyCodec_StrictErrors(*exceptionObject);
6546}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006547
6548/* error handling callback helper:
6549 build arguments, call the callback and check the arguments,
6550 put the result into newpos and return the replacement string, which
6551 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006552static PyObject *
6553unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006554 PyObject **errorHandler,
6555 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006556 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006557 Py_ssize_t startpos, Py_ssize_t endpos,
6558 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006559{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006560 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006561 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006562 PyObject *restuple;
6563 PyObject *resunicode;
6564
6565 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006567 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006568 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006569 }
6570
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006571 if (PyUnicode_READY(unicode) < 0)
6572 return NULL;
6573 len = PyUnicode_GET_LENGTH(unicode);
6574
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006575 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006576 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006577 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006579
6580 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006582 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006583 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006584 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006585 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006586 Py_DECREF(restuple);
6587 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006588 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006589 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 &resunicode, newpos)) {
6591 Py_DECREF(restuple);
6592 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006593 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006594 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6595 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6596 Py_DECREF(restuple);
6597 return NULL;
6598 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006599 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006600 *newpos = len + *newpos;
6601 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006602 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6603 Py_DECREF(restuple);
6604 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006605 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006606 Py_INCREF(resunicode);
6607 Py_DECREF(restuple);
6608 return resunicode;
6609}
6610
Alexander Belopolsky40018472011-02-26 01:02:56 +00006611static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006612unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006613 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006614 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006615{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006616 /* input state */
6617 Py_ssize_t pos=0, size;
6618 int kind;
6619 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006620 /* output object */
6621 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006622 /* pointer into the output */
6623 char *str;
6624 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006625 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006626 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6627 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006628 PyObject *errorHandler = NULL;
6629 PyObject *exc = NULL;
6630 /* the following variable is used for caching string comparisons
6631 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6632 int known_errorHandler = -1;
6633
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006634 if (PyUnicode_READY(unicode) < 0)
6635 return NULL;
6636 size = PyUnicode_GET_LENGTH(unicode);
6637 kind = PyUnicode_KIND(unicode);
6638 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006639 /* allocate enough for a simple encoding without
6640 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006641 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006642 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006643 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006644 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006645 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006646 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006647 ressize = size;
6648
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006649 while (pos < size) {
6650 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006651
Benjamin Peterson29060642009-01-31 22:14:21 +00006652 /* can we encode this? */
6653 if (c<limit) {
6654 /* no overflow check, because we know that the space is enough */
6655 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006656 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006657 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006658 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006659 Py_ssize_t requiredsize;
6660 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006661 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006663 Py_ssize_t collstart = pos;
6664 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006665 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006666 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 ++collend;
6668 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6669 if (known_errorHandler==-1) {
6670 if ((errors==NULL) || (!strcmp(errors, "strict")))
6671 known_errorHandler = 1;
6672 else if (!strcmp(errors, "replace"))
6673 known_errorHandler = 2;
6674 else if (!strcmp(errors, "ignore"))
6675 known_errorHandler = 3;
6676 else if (!strcmp(errors, "xmlcharrefreplace"))
6677 known_errorHandler = 4;
6678 else
6679 known_errorHandler = 0;
6680 }
6681 switch (known_errorHandler) {
6682 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006683 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 goto onError;
6685 case 2: /* replace */
6686 while (collstart++<collend)
6687 *str++ = '?'; /* fall through */
6688 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006689 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006690 break;
6691 case 4: /* xmlcharrefreplace */
6692 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006693 /* determine replacement size */
6694 for (i = collstart, repsize = 0; i < collend; ++i) {
6695 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6696 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006697 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006698 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006699 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006700 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006701 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006702 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006703 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006704 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006705 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006706 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006707 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006708 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006709 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006710 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006711 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006712 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006713 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006714 if (requiredsize > ressize) {
6715 if (requiredsize<2*ressize)
6716 requiredsize = 2*ressize;
6717 if (_PyBytes_Resize(&res, requiredsize))
6718 goto onError;
6719 str = PyBytes_AS_STRING(res) + respos;
6720 ressize = requiredsize;
6721 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006722 /* generate replacement */
6723 for (i = collstart; i < collend; ++i) {
6724 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006725 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006726 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 break;
6728 default:
6729 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006730 encoding, reason, unicode, &exc,
6731 collstart, collend, &newpos);
6732 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6733 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006735 if (PyBytes_Check(repunicode)) {
6736 /* Directly copy bytes result to output. */
6737 repsize = PyBytes_Size(repunicode);
6738 if (repsize > 1) {
6739 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006740 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006741 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6742 Py_DECREF(repunicode);
6743 goto onError;
6744 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006745 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006746 ressize += repsize-1;
6747 }
6748 memcpy(str, PyBytes_AsString(repunicode), repsize);
6749 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006750 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006751 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006752 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006753 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006754 /* need more space? (at least enough for what we
6755 have+the replacement+the rest of the string, so
6756 we won't have to check space for encodable characters) */
6757 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006758 repsize = PyUnicode_GET_LENGTH(repunicode);
6759 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006760 if (requiredsize > ressize) {
6761 if (requiredsize<2*ressize)
6762 requiredsize = 2*ressize;
6763 if (_PyBytes_Resize(&res, requiredsize)) {
6764 Py_DECREF(repunicode);
6765 goto onError;
6766 }
6767 str = PyBytes_AS_STRING(res) + respos;
6768 ressize = requiredsize;
6769 }
6770 /* check if there is anything unencodable in the replacement
6771 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006772 for (i = 0; repsize-->0; ++i, ++str) {
6773 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006775 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006776 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006777 Py_DECREF(repunicode);
6778 goto onError;
6779 }
6780 *str = (char)c;
6781 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006782 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006783 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006784 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006785 }
6786 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006787 /* Resize if we allocated to much */
6788 size = str - PyBytes_AS_STRING(res);
6789 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006790 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006791 if (_PyBytes_Resize(&res, size) < 0)
6792 goto onError;
6793 }
6794
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006795 Py_XDECREF(errorHandler);
6796 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006797 return res;
6798
6799 onError:
6800 Py_XDECREF(res);
6801 Py_XDECREF(errorHandler);
6802 Py_XDECREF(exc);
6803 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006804}
6805
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006806/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006807PyObject *
6808PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006809 Py_ssize_t size,
6810 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006812 PyObject *result;
6813 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6814 if (unicode == NULL)
6815 return NULL;
6816 result = unicode_encode_ucs1(unicode, errors, 256);
6817 Py_DECREF(unicode);
6818 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819}
6820
Alexander Belopolsky40018472011-02-26 01:02:56 +00006821PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006822_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823{
6824 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006825 PyErr_BadArgument();
6826 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006828 if (PyUnicode_READY(unicode) == -1)
6829 return NULL;
6830 /* Fast path: if it is a one-byte string, construct
6831 bytes object directly. */
6832 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6833 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6834 PyUnicode_GET_LENGTH(unicode));
6835 /* Non-Latin-1 characters present. Defer to above function to
6836 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006837 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006838}
6839
6840PyObject*
6841PyUnicode_AsLatin1String(PyObject *unicode)
6842{
6843 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844}
6845
6846/* --- 7-bit ASCII Codec -------------------------------------------------- */
6847
Alexander Belopolsky40018472011-02-26 01:02:56 +00006848PyObject *
6849PyUnicode_DecodeASCII(const char *s,
6850 Py_ssize_t size,
6851 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006853 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006854 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006855 int kind;
6856 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006857 Py_ssize_t startinpos;
6858 Py_ssize_t endinpos;
6859 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006860 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006861 int has_error;
6862 const unsigned char *p = (const unsigned char *)s;
6863 const unsigned char *end = p + size;
6864 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006865 PyObject *errorHandler = NULL;
6866 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006867
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006868 if (size == 0) {
6869 Py_INCREF(unicode_empty);
6870 return unicode_empty;
6871 }
6872
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006874 if (size == 1 && (unsigned char)s[0] < 128)
6875 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006876
Victor Stinner702c7342011-10-05 13:50:52 +02006877 has_error = 0;
6878 while (p < end && !has_error) {
6879 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6880 an explanation. */
6881 if (!((size_t) p & LONG_PTR_MASK)) {
6882 /* Help register allocation */
6883 register const unsigned char *_p = p;
6884 while (_p < aligned_end) {
6885 unsigned long value = *(unsigned long *) _p;
6886 if (value & ASCII_CHAR_MASK) {
6887 has_error = 1;
6888 break;
6889 }
6890 _p += SIZEOF_LONG;
6891 }
6892 if (_p == end)
6893 break;
6894 if (has_error)
6895 break;
6896 p = _p;
6897 }
6898 if (*p & 0x80) {
6899 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006900 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006901 }
6902 else {
6903 ++p;
6904 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006905 }
Victor Stinner702c7342011-10-05 13:50:52 +02006906 if (!has_error)
6907 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006908
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006909 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006911 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006913 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006914 kind = PyUnicode_KIND(v);
6915 data = PyUnicode_DATA(v);
6916 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006917 e = s + size;
6918 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006919 register unsigned char c = (unsigned char)*s;
6920 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006921 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006922 ++s;
6923 }
6924 else {
6925 startinpos = s-starts;
6926 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 if (unicode_decode_call_errorhandler(
6928 errors, &errorHandler,
6929 "ascii", "ordinal not in range(128)",
6930 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006931 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006932 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006933 kind = PyUnicode_KIND(v);
6934 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006935 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006937 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006938 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006939 Py_XDECREF(errorHandler);
6940 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006941 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006942 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006943
Benjamin Peterson29060642009-01-31 22:14:21 +00006944 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006946 Py_XDECREF(errorHandler);
6947 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948 return NULL;
6949}
6950
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006951/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006952PyObject *
6953PyUnicode_EncodeASCII(const Py_UNICODE *p,
6954 Py_ssize_t size,
6955 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006957 PyObject *result;
6958 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6959 if (unicode == NULL)
6960 return NULL;
6961 result = unicode_encode_ucs1(unicode, errors, 128);
6962 Py_DECREF(unicode);
6963 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964}
6965
Alexander Belopolsky40018472011-02-26 01:02:56 +00006966PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006967_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968{
6969 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006970 PyErr_BadArgument();
6971 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006973 if (PyUnicode_READY(unicode) == -1)
6974 return NULL;
6975 /* Fast path: if it is an ASCII-only string, construct bytes object
6976 directly. Else defer to above function to raise the exception. */
6977 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6978 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6979 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006980 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006981}
6982
6983PyObject *
6984PyUnicode_AsASCIIString(PyObject *unicode)
6985{
6986 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006987}
6988
Victor Stinner99b95382011-07-04 14:23:54 +02006989#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006990
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006991/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006992
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006993#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006994#define NEED_RETRY
6995#endif
6996
Victor Stinner3a50e702011-10-18 21:21:00 +02006997#ifndef WC_ERR_INVALID_CHARS
6998# define WC_ERR_INVALID_CHARS 0x0080
6999#endif
7000
7001static char*
7002code_page_name(UINT code_page, PyObject **obj)
7003{
7004 *obj = NULL;
7005 if (code_page == CP_ACP)
7006 return "mbcs";
7007 if (code_page == CP_UTF7)
7008 return "CP_UTF7";
7009 if (code_page == CP_UTF8)
7010 return "CP_UTF8";
7011
7012 *obj = PyBytes_FromFormat("cp%u", code_page);
7013 if (*obj == NULL)
7014 return NULL;
7015 return PyBytes_AS_STRING(*obj);
7016}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007017
Alexander Belopolsky40018472011-02-26 01:02:56 +00007018static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007019is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007020{
7021 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02007022 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007023
Victor Stinner3a50e702011-10-18 21:21:00 +02007024 if (!IsDBCSLeadByteEx(code_page, *curr))
7025 return 0;
7026
7027 prev = CharPrevExA(code_page, s, curr, 0);
7028 if (prev == curr)
7029 return 1;
7030 /* FIXME: This code is limited to "true" double-byte encodings,
7031 as it assumes an incomplete character consists of a single
7032 byte. */
7033 if (curr - prev == 2)
7034 return 1;
7035 if (!IsDBCSLeadByteEx(code_page, *prev))
7036 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007037 return 0;
7038}
7039
Victor Stinner3a50e702011-10-18 21:21:00 +02007040static DWORD
7041decode_code_page_flags(UINT code_page)
7042{
7043 if (code_page == CP_UTF7) {
7044 /* The CP_UTF7 decoder only supports flags=0 */
7045 return 0;
7046 }
7047 else
7048 return MB_ERR_INVALID_CHARS;
7049}
7050
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007051/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007052 * Decode a byte string from a Windows code page into unicode object in strict
7053 * mode.
7054 *
7055 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7056 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007057 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007058static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007059decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007060 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007061 const char *in,
7062 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007063{
Victor Stinner3a50e702011-10-18 21:21:00 +02007064 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007065 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007066 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007067
7068 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007069 assert(insize > 0);
7070 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7071 if (outsize <= 0)
7072 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007073
7074 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007075 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007076 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007077 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007078 if (*v == NULL)
7079 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007080 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007081 }
7082 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007083 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007084 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007085 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007086 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007087 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007088 }
7089
7090 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007091 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7092 if (outsize <= 0)
7093 goto error;
7094 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007095
Victor Stinner3a50e702011-10-18 21:21:00 +02007096error:
7097 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7098 return -2;
7099 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007100 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007101}
7102
Victor Stinner3a50e702011-10-18 21:21:00 +02007103/*
7104 * Decode a byte string from a code page into unicode object with an error
7105 * handler.
7106 *
7107 * Returns consumed size if succeed, or raise a WindowsError or
7108 * UnicodeDecodeError exception and returns -1 on error.
7109 */
7110static int
7111decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007112 PyObject **v,
7113 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007114 const char *errors)
7115{
7116 const char *startin = in;
7117 const char *endin = in + size;
7118 const DWORD flags = decode_code_page_flags(code_page);
7119 /* Ideally, we should get reason from FormatMessage. This is the Windows
7120 2000 English version of the message. */
7121 const char *reason = "No mapping for the Unicode character exists "
7122 "in the target code page.";
7123 /* each step cannot decode more than 1 character, but a character can be
7124 represented as a surrogate pair */
7125 wchar_t buffer[2], *startout, *out;
7126 int insize, outsize;
7127 PyObject *errorHandler = NULL;
7128 PyObject *exc = NULL;
7129 PyObject *encoding_obj = NULL;
7130 char *encoding;
7131 DWORD err;
7132 int ret = -1;
7133
7134 assert(size > 0);
7135
7136 encoding = code_page_name(code_page, &encoding_obj);
7137 if (encoding == NULL)
7138 return -1;
7139
7140 if (errors == NULL || strcmp(errors, "strict") == 0) {
7141 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7142 UnicodeDecodeError. */
7143 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7144 if (exc != NULL) {
7145 PyCodec_StrictErrors(exc);
7146 Py_CLEAR(exc);
7147 }
7148 goto error;
7149 }
7150
7151 if (*v == NULL) {
7152 /* Create unicode object */
7153 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7154 PyErr_NoMemory();
7155 goto error;
7156 }
Victor Stinnerab595942011-12-17 04:59:06 +01007157 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007158 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007159 if (*v == NULL)
7160 goto error;
7161 startout = PyUnicode_AS_UNICODE(*v);
7162 }
7163 else {
7164 /* Extend unicode object */
7165 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7166 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7167 PyErr_NoMemory();
7168 goto error;
7169 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007170 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007171 goto error;
7172 startout = PyUnicode_AS_UNICODE(*v) + n;
7173 }
7174
7175 /* Decode the byte string character per character */
7176 out = startout;
7177 while (in < endin)
7178 {
7179 /* Decode a character */
7180 insize = 1;
7181 do
7182 {
7183 outsize = MultiByteToWideChar(code_page, flags,
7184 in, insize,
7185 buffer, Py_ARRAY_LENGTH(buffer));
7186 if (outsize > 0)
7187 break;
7188 err = GetLastError();
7189 if (err != ERROR_NO_UNICODE_TRANSLATION
7190 && err != ERROR_INSUFFICIENT_BUFFER)
7191 {
7192 PyErr_SetFromWindowsErr(0);
7193 goto error;
7194 }
7195 insize++;
7196 }
7197 /* 4=maximum length of a UTF-8 sequence */
7198 while (insize <= 4 && (in + insize) <= endin);
7199
7200 if (outsize <= 0) {
7201 Py_ssize_t startinpos, endinpos, outpos;
7202
7203 startinpos = in - startin;
7204 endinpos = startinpos + 1;
7205 outpos = out - PyUnicode_AS_UNICODE(*v);
7206 if (unicode_decode_call_errorhandler(
7207 errors, &errorHandler,
7208 encoding, reason,
7209 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007210 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007211 {
7212 goto error;
7213 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007214 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007215 }
7216 else {
7217 in += insize;
7218 memcpy(out, buffer, outsize * sizeof(wchar_t));
7219 out += outsize;
7220 }
7221 }
7222
7223 /* write a NUL character at the end */
7224 *out = 0;
7225
7226 /* Extend unicode object */
7227 outsize = out - startout;
7228 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007229 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007230 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007231 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007232
7233error:
7234 Py_XDECREF(encoding_obj);
7235 Py_XDECREF(errorHandler);
7236 Py_XDECREF(exc);
7237 return ret;
7238}
7239
Victor Stinner3a50e702011-10-18 21:21:00 +02007240static PyObject *
7241decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007242 const char *s, Py_ssize_t size,
7243 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007244{
Victor Stinner76a31a62011-11-04 00:05:13 +01007245 PyObject *v = NULL;
7246 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007247
Victor Stinner3a50e702011-10-18 21:21:00 +02007248 if (code_page < 0) {
7249 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7250 return NULL;
7251 }
7252
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007253 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007254 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007255
Victor Stinner76a31a62011-11-04 00:05:13 +01007256 do
7257 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007258#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007259 if (size > INT_MAX) {
7260 chunk_size = INT_MAX;
7261 final = 0;
7262 done = 0;
7263 }
7264 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007265#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007266 {
7267 chunk_size = (int)size;
7268 final = (consumed == NULL);
7269 done = 1;
7270 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007271
Victor Stinner76a31a62011-11-04 00:05:13 +01007272 /* Skip trailing lead-byte unless 'final' is set */
7273 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7274 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007275
Victor Stinner76a31a62011-11-04 00:05:13 +01007276 if (chunk_size == 0 && done) {
7277 if (v != NULL)
7278 break;
7279 Py_INCREF(unicode_empty);
7280 return unicode_empty;
7281 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007282
Victor Stinner76a31a62011-11-04 00:05:13 +01007283
7284 converted = decode_code_page_strict(code_page, &v,
7285 s, chunk_size);
7286 if (converted == -2)
7287 converted = decode_code_page_errors(code_page, &v,
7288 s, chunk_size,
7289 errors);
7290 assert(converted != 0);
7291
7292 if (converted < 0) {
7293 Py_XDECREF(v);
7294 return NULL;
7295 }
7296
7297 if (consumed)
7298 *consumed += converted;
7299
7300 s += converted;
7301 size -= converted;
7302 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007303
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007304 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007305}
7306
Alexander Belopolsky40018472011-02-26 01:02:56 +00007307PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007308PyUnicode_DecodeCodePageStateful(int code_page,
7309 const char *s,
7310 Py_ssize_t size,
7311 const char *errors,
7312 Py_ssize_t *consumed)
7313{
7314 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7315}
7316
7317PyObject *
7318PyUnicode_DecodeMBCSStateful(const char *s,
7319 Py_ssize_t size,
7320 const char *errors,
7321 Py_ssize_t *consumed)
7322{
7323 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7324}
7325
7326PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007327PyUnicode_DecodeMBCS(const char *s,
7328 Py_ssize_t size,
7329 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007330{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007331 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7332}
7333
Victor Stinner3a50e702011-10-18 21:21:00 +02007334static DWORD
7335encode_code_page_flags(UINT code_page, const char *errors)
7336{
7337 if (code_page == CP_UTF8) {
7338 if (winver.dwMajorVersion >= 6)
7339 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7340 and later */
7341 return WC_ERR_INVALID_CHARS;
7342 else
7343 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7344 return 0;
7345 }
7346 else if (code_page == CP_UTF7) {
7347 /* CP_UTF7 only supports flags=0 */
7348 return 0;
7349 }
7350 else {
7351 if (errors != NULL && strcmp(errors, "replace") == 0)
7352 return 0;
7353 else
7354 return WC_NO_BEST_FIT_CHARS;
7355 }
7356}
7357
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007358/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007359 * Encode a Unicode string to a Windows code page into a byte string in strict
7360 * mode.
7361 *
7362 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7363 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007364 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007365static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007366encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007367 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007368 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007369{
Victor Stinner554f3f02010-06-16 23:33:54 +00007370 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007371 BOOL *pusedDefaultChar = &usedDefaultChar;
7372 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007373 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007374 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007375 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007376 const DWORD flags = encode_code_page_flags(code_page, NULL);
7377 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007378 /* Create a substring so that we can get the UTF-16 representation
7379 of just the slice under consideration. */
7380 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007381
Martin v. Löwis3d325192011-11-04 18:23:06 +01007382 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007383
Victor Stinner3a50e702011-10-18 21:21:00 +02007384 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007385 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007386 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007387 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007388
Victor Stinner2fc507f2011-11-04 20:06:39 +01007389 substring = PyUnicode_Substring(unicode, offset, offset+len);
7390 if (substring == NULL)
7391 return -1;
7392 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7393 if (p == NULL) {
7394 Py_DECREF(substring);
7395 return -1;
7396 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007397
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007398 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007399 outsize = WideCharToMultiByte(code_page, flags,
7400 p, size,
7401 NULL, 0,
7402 NULL, pusedDefaultChar);
7403 if (outsize <= 0)
7404 goto error;
7405 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007406 if (pusedDefaultChar && *pusedDefaultChar) {
7407 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007408 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007409 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007410
Victor Stinner3a50e702011-10-18 21:21:00 +02007411 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007412 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007413 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007414 if (*outbytes == NULL) {
7415 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007416 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007417 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007418 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007419 }
7420 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007422 const Py_ssize_t n = PyBytes_Size(*outbytes);
7423 if (outsize > PY_SSIZE_T_MAX - n) {
7424 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007425 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007426 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007428 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7429 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007430 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007431 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007432 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007433 }
7434
7435 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007436 outsize = WideCharToMultiByte(code_page, flags,
7437 p, size,
7438 out, outsize,
7439 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007440 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007441 if (outsize <= 0)
7442 goto error;
7443 if (pusedDefaultChar && *pusedDefaultChar)
7444 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007445 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007446
Victor Stinner3a50e702011-10-18 21:21:00 +02007447error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007448 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007449 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7450 return -2;
7451 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007452 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007453}
7454
Victor Stinner3a50e702011-10-18 21:21:00 +02007455/*
7456 * Encode a Unicode string to a Windows code page into a byte string using a
7457 * error handler.
7458 *
7459 * Returns consumed characters if succeed, or raise a WindowsError and returns
7460 * -1 on other error.
7461 */
7462static int
7463encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007464 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007465 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007466{
Victor Stinner3a50e702011-10-18 21:21:00 +02007467 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007468 Py_ssize_t pos = unicode_offset;
7469 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007470 /* Ideally, we should get reason from FormatMessage. This is the Windows
7471 2000 English version of the message. */
7472 const char *reason = "invalid character";
7473 /* 4=maximum length of a UTF-8 sequence */
7474 char buffer[4];
7475 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7476 Py_ssize_t outsize;
7477 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 PyObject *errorHandler = NULL;
7479 PyObject *exc = NULL;
7480 PyObject *encoding_obj = NULL;
7481 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007482 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 PyObject *rep;
7484 int ret = -1;
7485
7486 assert(insize > 0);
7487
7488 encoding = code_page_name(code_page, &encoding_obj);
7489 if (encoding == NULL)
7490 return -1;
7491
7492 if (errors == NULL || strcmp(errors, "strict") == 0) {
7493 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7494 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007495 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007496 if (exc != NULL) {
7497 PyCodec_StrictErrors(exc);
7498 Py_DECREF(exc);
7499 }
7500 Py_XDECREF(encoding_obj);
7501 return -1;
7502 }
7503
7504 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7505 pusedDefaultChar = &usedDefaultChar;
7506 else
7507 pusedDefaultChar = NULL;
7508
7509 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7510 PyErr_NoMemory();
7511 goto error;
7512 }
7513 outsize = insize * Py_ARRAY_LENGTH(buffer);
7514
7515 if (*outbytes == NULL) {
7516 /* Create string object */
7517 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7518 if (*outbytes == NULL)
7519 goto error;
7520 out = PyBytes_AS_STRING(*outbytes);
7521 }
7522 else {
7523 /* Extend string object */
7524 Py_ssize_t n = PyBytes_Size(*outbytes);
7525 if (n > PY_SSIZE_T_MAX - outsize) {
7526 PyErr_NoMemory();
7527 goto error;
7528 }
7529 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7530 goto error;
7531 out = PyBytes_AS_STRING(*outbytes) + n;
7532 }
7533
7534 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007535 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007536 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007537 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7538 wchar_t chars[2];
7539 int charsize;
7540 if (ch < 0x10000) {
7541 chars[0] = (wchar_t)ch;
7542 charsize = 1;
7543 }
7544 else {
7545 ch -= 0x10000;
7546 chars[0] = 0xd800 + (ch >> 10);
7547 chars[1] = 0xdc00 + (ch & 0x3ff);
7548 charsize = 2;
7549 }
7550
Victor Stinner3a50e702011-10-18 21:21:00 +02007551 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007552 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007553 buffer, Py_ARRAY_LENGTH(buffer),
7554 NULL, pusedDefaultChar);
7555 if (outsize > 0) {
7556 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7557 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007558 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007559 memcpy(out, buffer, outsize);
7560 out += outsize;
7561 continue;
7562 }
7563 }
7564 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7565 PyErr_SetFromWindowsErr(0);
7566 goto error;
7567 }
7568
Victor Stinner3a50e702011-10-18 21:21:00 +02007569 rep = unicode_encode_call_errorhandler(
7570 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007571 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007572 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007573 if (rep == NULL)
7574 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007575 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007576
7577 if (PyBytes_Check(rep)) {
7578 outsize = PyBytes_GET_SIZE(rep);
7579 if (outsize != 1) {
7580 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7581 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7582 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7583 Py_DECREF(rep);
7584 goto error;
7585 }
7586 out = PyBytes_AS_STRING(*outbytes) + offset;
7587 }
7588 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7589 out += outsize;
7590 }
7591 else {
7592 Py_ssize_t i;
7593 enum PyUnicode_Kind kind;
7594 void *data;
7595
7596 if (PyUnicode_READY(rep) < 0) {
7597 Py_DECREF(rep);
7598 goto error;
7599 }
7600
7601 outsize = PyUnicode_GET_LENGTH(rep);
7602 if (outsize != 1) {
7603 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7604 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7605 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7606 Py_DECREF(rep);
7607 goto error;
7608 }
7609 out = PyBytes_AS_STRING(*outbytes) + offset;
7610 }
7611 kind = PyUnicode_KIND(rep);
7612 data = PyUnicode_DATA(rep);
7613 for (i=0; i < outsize; i++) {
7614 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7615 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007616 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007617 encoding, unicode,
7618 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007619 "unable to encode error handler result to ASCII");
7620 Py_DECREF(rep);
7621 goto error;
7622 }
7623 *out = (unsigned char)ch;
7624 out++;
7625 }
7626 }
7627 Py_DECREF(rep);
7628 }
7629 /* write a NUL byte */
7630 *out = 0;
7631 outsize = out - PyBytes_AS_STRING(*outbytes);
7632 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7633 if (_PyBytes_Resize(outbytes, outsize) < 0)
7634 goto error;
7635 ret = 0;
7636
7637error:
7638 Py_XDECREF(encoding_obj);
7639 Py_XDECREF(errorHandler);
7640 Py_XDECREF(exc);
7641 return ret;
7642}
7643
Victor Stinner3a50e702011-10-18 21:21:00 +02007644static PyObject *
7645encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007646 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007647 const char *errors)
7648{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007649 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007650 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007651 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007652 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007653
Victor Stinner2fc507f2011-11-04 20:06:39 +01007654 if (PyUnicode_READY(unicode) < 0)
7655 return NULL;
7656 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007657
Victor Stinner3a50e702011-10-18 21:21:00 +02007658 if (code_page < 0) {
7659 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7660 return NULL;
7661 }
7662
Martin v. Löwis3d325192011-11-04 18:23:06 +01007663 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007664 return PyBytes_FromStringAndSize(NULL, 0);
7665
Victor Stinner7581cef2011-11-03 22:32:33 +01007666 offset = 0;
7667 do
7668 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007669#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007670 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007671 chunks. */
7672 if (len > INT_MAX/2) {
7673 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007674 done = 0;
7675 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007676 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007677#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007678 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007679 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007680 done = 1;
7681 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007682
Victor Stinner76a31a62011-11-04 00:05:13 +01007683 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007684 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007685 errors);
7686 if (ret == -2)
7687 ret = encode_code_page_errors(code_page, &outbytes,
7688 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007689 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007690 if (ret < 0) {
7691 Py_XDECREF(outbytes);
7692 return NULL;
7693 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007694
Victor Stinner7581cef2011-11-03 22:32:33 +01007695 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007696 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007697 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007698
Victor Stinner3a50e702011-10-18 21:21:00 +02007699 return outbytes;
7700}
7701
7702PyObject *
7703PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7704 Py_ssize_t size,
7705 const char *errors)
7706{
Victor Stinner7581cef2011-11-03 22:32:33 +01007707 PyObject *unicode, *res;
7708 unicode = PyUnicode_FromUnicode(p, size);
7709 if (unicode == NULL)
7710 return NULL;
7711 res = encode_code_page(CP_ACP, unicode, errors);
7712 Py_DECREF(unicode);
7713 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007714}
7715
7716PyObject *
7717PyUnicode_EncodeCodePage(int code_page,
7718 PyObject *unicode,
7719 const char *errors)
7720{
Victor Stinner7581cef2011-11-03 22:32:33 +01007721 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007722}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007723
Alexander Belopolsky40018472011-02-26 01:02:56 +00007724PyObject *
7725PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007726{
7727 if (!PyUnicode_Check(unicode)) {
7728 PyErr_BadArgument();
7729 return NULL;
7730 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007731 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007732}
7733
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007734#undef NEED_RETRY
7735
Victor Stinner99b95382011-07-04 14:23:54 +02007736#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007737
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738/* --- Character Mapping Codec -------------------------------------------- */
7739
Alexander Belopolsky40018472011-02-26 01:02:56 +00007740PyObject *
7741PyUnicode_DecodeCharmap(const char *s,
7742 Py_ssize_t size,
7743 PyObject *mapping,
7744 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007746 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007747 Py_ssize_t startinpos;
7748 Py_ssize_t endinpos;
7749 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007750 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007751 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007752 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007753 PyObject *errorHandler = NULL;
7754 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007755
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756 /* Default to Latin-1 */
7757 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007758 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007760 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007762 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007764 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007765 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007766 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007767 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007768 Py_ssize_t maplen;
7769 enum PyUnicode_Kind kind;
7770 void *data;
7771 Py_UCS4 x;
7772
7773 if (PyUnicode_READY(mapping) < 0)
7774 return NULL;
7775
7776 maplen = PyUnicode_GET_LENGTH(mapping);
7777 data = PyUnicode_DATA(mapping);
7778 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 while (s < e) {
7780 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781
Benjamin Peterson29060642009-01-31 22:14:21 +00007782 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007783 x = PyUnicode_READ(kind, data, ch);
7784 else
7785 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007787 if (x == 0xfffe)
7788 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007790 startinpos = s-starts;
7791 endinpos = startinpos+1;
7792 if (unicode_decode_call_errorhandler(
7793 errors, &errorHandler,
7794 "charmap", "character maps to <undefined>",
7795 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007796 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 goto onError;
7798 }
7799 continue;
7800 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007801
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007802 if (unicode_putchar(&v, &outpos, x) < 0)
7803 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007804 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007805 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007806 }
7807 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007808 while (s < e) {
7809 unsigned char ch = *s;
7810 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007811
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7813 w = PyLong_FromLong((long)ch);
7814 if (w == NULL)
7815 goto onError;
7816 x = PyObject_GetItem(mapping, w);
7817 Py_DECREF(w);
7818 if (x == NULL) {
7819 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7820 /* No mapping found means: mapping is undefined. */
7821 PyErr_Clear();
7822 x = Py_None;
7823 Py_INCREF(x);
7824 } else
7825 goto onError;
7826 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007827
Benjamin Peterson29060642009-01-31 22:14:21 +00007828 /* Apply mapping */
7829 if (PyLong_Check(x)) {
7830 long value = PyLong_AS_LONG(x);
7831 if (value < 0 || value > 65535) {
7832 PyErr_SetString(PyExc_TypeError,
7833 "character mapping must be in range(65536)");
7834 Py_DECREF(x);
7835 goto onError;
7836 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007837 if (unicode_putchar(&v, &outpos, value) < 0)
7838 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007839 }
7840 else if (x == Py_None) {
7841 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 startinpos = s-starts;
7843 endinpos = startinpos+1;
7844 if (unicode_decode_call_errorhandler(
7845 errors, &errorHandler,
7846 "charmap", "character maps to <undefined>",
7847 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007848 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007849 Py_DECREF(x);
7850 goto onError;
7851 }
7852 Py_DECREF(x);
7853 continue;
7854 }
7855 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007856 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007857
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007858 if (PyUnicode_READY(x) < 0)
7859 goto onError;
7860 targetsize = PyUnicode_GET_LENGTH(x);
7861
7862 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007863 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007864 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007865 PyUnicode_READ_CHAR(x, 0)) < 0)
7866 goto onError;
7867 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007868 else if (targetsize > 1) {
7869 /* 1-n mapping */
7870 if (targetsize > extrachars) {
7871 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007872 Py_ssize_t needed = (targetsize - extrachars) + \
7873 (targetsize << 2);
7874 extrachars += needed;
7875 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007876 if (unicode_resize(&v,
7877 PyUnicode_GET_LENGTH(v) + needed) < 0)
7878 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007879 Py_DECREF(x);
7880 goto onError;
7881 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007883 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7884 goto onError;
7885 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7886 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007887 extrachars -= targetsize;
7888 }
7889 /* 1-0 mapping: skip the character */
7890 }
7891 else {
7892 /* wrong return value */
7893 PyErr_SetString(PyExc_TypeError,
7894 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007895 Py_DECREF(x);
7896 goto onError;
7897 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007898 Py_DECREF(x);
7899 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007900 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007902 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007903 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007904 Py_XDECREF(errorHandler);
7905 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007906 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007907
Benjamin Peterson29060642009-01-31 22:14:21 +00007908 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007909 Py_XDECREF(errorHandler);
7910 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007911 Py_XDECREF(v);
7912 return NULL;
7913}
7914
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007915/* Charmap encoding: the lookup table */
7916
Alexander Belopolsky40018472011-02-26 01:02:56 +00007917struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007918 PyObject_HEAD
7919 unsigned char level1[32];
7920 int count2, count3;
7921 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007922};
7923
7924static PyObject*
7925encoding_map_size(PyObject *obj, PyObject* args)
7926{
7927 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007928 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007929 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007930}
7931
7932static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007933 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007934 PyDoc_STR("Return the size (in bytes) of this object") },
7935 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007936};
7937
7938static void
7939encoding_map_dealloc(PyObject* o)
7940{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007941 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007942}
7943
7944static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007945 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007946 "EncodingMap", /*tp_name*/
7947 sizeof(struct encoding_map), /*tp_basicsize*/
7948 0, /*tp_itemsize*/
7949 /* methods */
7950 encoding_map_dealloc, /*tp_dealloc*/
7951 0, /*tp_print*/
7952 0, /*tp_getattr*/
7953 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007954 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007955 0, /*tp_repr*/
7956 0, /*tp_as_number*/
7957 0, /*tp_as_sequence*/
7958 0, /*tp_as_mapping*/
7959 0, /*tp_hash*/
7960 0, /*tp_call*/
7961 0, /*tp_str*/
7962 0, /*tp_getattro*/
7963 0, /*tp_setattro*/
7964 0, /*tp_as_buffer*/
7965 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7966 0, /*tp_doc*/
7967 0, /*tp_traverse*/
7968 0, /*tp_clear*/
7969 0, /*tp_richcompare*/
7970 0, /*tp_weaklistoffset*/
7971 0, /*tp_iter*/
7972 0, /*tp_iternext*/
7973 encoding_map_methods, /*tp_methods*/
7974 0, /*tp_members*/
7975 0, /*tp_getset*/
7976 0, /*tp_base*/
7977 0, /*tp_dict*/
7978 0, /*tp_descr_get*/
7979 0, /*tp_descr_set*/
7980 0, /*tp_dictoffset*/
7981 0, /*tp_init*/
7982 0, /*tp_alloc*/
7983 0, /*tp_new*/
7984 0, /*tp_free*/
7985 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007986};
7987
7988PyObject*
7989PyUnicode_BuildEncodingMap(PyObject* string)
7990{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007991 PyObject *result;
7992 struct encoding_map *mresult;
7993 int i;
7994 int need_dict = 0;
7995 unsigned char level1[32];
7996 unsigned char level2[512];
7997 unsigned char *mlevel1, *mlevel2, *mlevel3;
7998 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007999 int kind;
8000 void *data;
8001 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008003 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008004 PyErr_BadArgument();
8005 return NULL;
8006 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008007 kind = PyUnicode_KIND(string);
8008 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008009 memset(level1, 0xFF, sizeof level1);
8010 memset(level2, 0xFF, sizeof level2);
8011
8012 /* If there isn't a one-to-one mapping of NULL to \0,
8013 or if there are non-BMP characters, we need to use
8014 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008015 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008016 need_dict = 1;
8017 for (i = 1; i < 256; i++) {
8018 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008019 ch = PyUnicode_READ(kind, data, i);
8020 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008021 need_dict = 1;
8022 break;
8023 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008024 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008025 /* unmapped character */
8026 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008027 l1 = ch >> 11;
8028 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008029 if (level1[l1] == 0xFF)
8030 level1[l1] = count2++;
8031 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008032 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008033 }
8034
8035 if (count2 >= 0xFF || count3 >= 0xFF)
8036 need_dict = 1;
8037
8038 if (need_dict) {
8039 PyObject *result = PyDict_New();
8040 PyObject *key, *value;
8041 if (!result)
8042 return NULL;
8043 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008044 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008045 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008046 if (!key || !value)
8047 goto failed1;
8048 if (PyDict_SetItem(result, key, value) == -1)
8049 goto failed1;
8050 Py_DECREF(key);
8051 Py_DECREF(value);
8052 }
8053 return result;
8054 failed1:
8055 Py_XDECREF(key);
8056 Py_XDECREF(value);
8057 Py_DECREF(result);
8058 return NULL;
8059 }
8060
8061 /* Create a three-level trie */
8062 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8063 16*count2 + 128*count3 - 1);
8064 if (!result)
8065 return PyErr_NoMemory();
8066 PyObject_Init(result, &EncodingMapType);
8067 mresult = (struct encoding_map*)result;
8068 mresult->count2 = count2;
8069 mresult->count3 = count3;
8070 mlevel1 = mresult->level1;
8071 mlevel2 = mresult->level23;
8072 mlevel3 = mresult->level23 + 16*count2;
8073 memcpy(mlevel1, level1, 32);
8074 memset(mlevel2, 0xFF, 16*count2);
8075 memset(mlevel3, 0, 128*count3);
8076 count3 = 0;
8077 for (i = 1; i < 256; i++) {
8078 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008079 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008080 /* unmapped character */
8081 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008082 o1 = PyUnicode_READ(kind, data, i)>>11;
8083 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008084 i2 = 16*mlevel1[o1] + o2;
8085 if (mlevel2[i2] == 0xFF)
8086 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008087 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008088 i3 = 128*mlevel2[i2] + o3;
8089 mlevel3[i3] = i;
8090 }
8091 return result;
8092}
8093
8094static int
Victor Stinner22168992011-11-20 17:09:18 +01008095encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008096{
8097 struct encoding_map *map = (struct encoding_map*)mapping;
8098 int l1 = c>>11;
8099 int l2 = (c>>7) & 0xF;
8100 int l3 = c & 0x7F;
8101 int i;
8102
Victor Stinner22168992011-11-20 17:09:18 +01008103 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008105 if (c == 0)
8106 return 0;
8107 /* level 1*/
8108 i = map->level1[l1];
8109 if (i == 0xFF) {
8110 return -1;
8111 }
8112 /* level 2*/
8113 i = map->level23[16*i+l2];
8114 if (i == 0xFF) {
8115 return -1;
8116 }
8117 /* level 3 */
8118 i = map->level23[16*map->count2 + 128*i + l3];
8119 if (i == 0) {
8120 return -1;
8121 }
8122 return i;
8123}
8124
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008125/* Lookup the character ch in the mapping. If the character
8126 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008127 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008128static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008129charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008130{
Christian Heimes217cfd12007-12-02 14:31:20 +00008131 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008132 PyObject *x;
8133
8134 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008136 x = PyObject_GetItem(mapping, w);
8137 Py_DECREF(w);
8138 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008139 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8140 /* No mapping found means: mapping is undefined. */
8141 PyErr_Clear();
8142 x = Py_None;
8143 Py_INCREF(x);
8144 return x;
8145 } else
8146 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008148 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008149 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008150 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008151 long value = PyLong_AS_LONG(x);
8152 if (value < 0 || value > 255) {
8153 PyErr_SetString(PyExc_TypeError,
8154 "character mapping must be in range(256)");
8155 Py_DECREF(x);
8156 return NULL;
8157 }
8158 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008160 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008161 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008163 /* wrong return value */
8164 PyErr_Format(PyExc_TypeError,
8165 "character mapping must return integer, bytes or None, not %.400s",
8166 x->ob_type->tp_name);
8167 Py_DECREF(x);
8168 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169 }
8170}
8171
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008172static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008173charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008174{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008175 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8176 /* exponentially overallocate to minimize reallocations */
8177 if (requiredsize < 2*outsize)
8178 requiredsize = 2*outsize;
8179 if (_PyBytes_Resize(outobj, requiredsize))
8180 return -1;
8181 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008182}
8183
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008187/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008188 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008189 space is available. Return a new reference to the object that
8190 was put in the output buffer, or Py_None, if the mapping was undefined
8191 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008192 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008193static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008194charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008195 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008196{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008197 PyObject *rep;
8198 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008199 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008200
Christian Heimes90aa7642007-12-19 02:45:37 +00008201 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008202 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008203 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008204 if (res == -1)
8205 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008206 if (outsize<requiredsize)
8207 if (charmapencode_resize(outobj, outpos, requiredsize))
8208 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008209 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 outstart[(*outpos)++] = (char)res;
8211 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008212 }
8213
8214 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008215 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008216 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008217 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008218 Py_DECREF(rep);
8219 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008220 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 if (PyLong_Check(rep)) {
8222 Py_ssize_t requiredsize = *outpos+1;
8223 if (outsize<requiredsize)
8224 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8225 Py_DECREF(rep);
8226 return enc_EXCEPTION;
8227 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008228 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008229 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008230 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 else {
8232 const char *repchars = PyBytes_AS_STRING(rep);
8233 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8234 Py_ssize_t requiredsize = *outpos+repsize;
8235 if (outsize<requiredsize)
8236 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8237 Py_DECREF(rep);
8238 return enc_EXCEPTION;
8239 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008240 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 memcpy(outstart + *outpos, repchars, repsize);
8242 *outpos += repsize;
8243 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008244 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008245 Py_DECREF(rep);
8246 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008247}
8248
8249/* handle an error in PyUnicode_EncodeCharmap
8250 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008251static int
8252charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008253 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008254 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008255 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008256 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008257{
8258 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008259 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008260 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008261 enum PyUnicode_Kind kind;
8262 void *data;
8263 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008265 Py_ssize_t collstartpos = *inpos;
8266 Py_ssize_t collendpos = *inpos+1;
8267 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008268 char *encoding = "charmap";
8269 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008270 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008271 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008272 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008273
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008274 if (PyUnicode_READY(unicode) < 0)
8275 return -1;
8276 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008277 /* find all unencodable characters */
8278 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008279 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008280 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008281 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008282 val = encoding_map_lookup(ch, mapping);
8283 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 break;
8285 ++collendpos;
8286 continue;
8287 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008288
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008289 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8290 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 if (rep==NULL)
8292 return -1;
8293 else if (rep!=Py_None) {
8294 Py_DECREF(rep);
8295 break;
8296 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008297 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008299 }
8300 /* cache callback name lookup
8301 * (if not done yet, i.e. it's the first error) */
8302 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 if ((errors==NULL) || (!strcmp(errors, "strict")))
8304 *known_errorHandler = 1;
8305 else if (!strcmp(errors, "replace"))
8306 *known_errorHandler = 2;
8307 else if (!strcmp(errors, "ignore"))
8308 *known_errorHandler = 3;
8309 else if (!strcmp(errors, "xmlcharrefreplace"))
8310 *known_errorHandler = 4;
8311 else
8312 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008313 }
8314 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008315 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008316 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008317 return -1;
8318 case 2: /* replace */
8319 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 x = charmapencode_output('?', mapping, res, respos);
8321 if (x==enc_EXCEPTION) {
8322 return -1;
8323 }
8324 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008325 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 return -1;
8327 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008328 }
8329 /* fall through */
8330 case 3: /* ignore */
8331 *inpos = collendpos;
8332 break;
8333 case 4: /* xmlcharrefreplace */
8334 /* generate replacement (temporarily (mis)uses p) */
8335 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 char buffer[2+29+1+1];
8337 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008338 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 for (cp = buffer; *cp; ++cp) {
8340 x = charmapencode_output(*cp, mapping, res, respos);
8341 if (x==enc_EXCEPTION)
8342 return -1;
8343 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008344 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 return -1;
8346 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008347 }
8348 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008349 *inpos = collendpos;
8350 break;
8351 default:
8352 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008353 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008355 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008357 if (PyBytes_Check(repunicode)) {
8358 /* Directly copy bytes result to output. */
8359 Py_ssize_t outsize = PyBytes_Size(*res);
8360 Py_ssize_t requiredsize;
8361 repsize = PyBytes_Size(repunicode);
8362 requiredsize = *respos + repsize;
8363 if (requiredsize > outsize)
8364 /* Make room for all additional bytes. */
8365 if (charmapencode_resize(res, respos, requiredsize)) {
8366 Py_DECREF(repunicode);
8367 return -1;
8368 }
8369 memcpy(PyBytes_AsString(*res) + *respos,
8370 PyBytes_AsString(repunicode), repsize);
8371 *respos += repsize;
8372 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008373 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008374 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008375 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008376 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008377 if (PyUnicode_READY(repunicode) < 0) {
8378 Py_DECREF(repunicode);
8379 return -1;
8380 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008381 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008382 data = PyUnicode_DATA(repunicode);
8383 kind = PyUnicode_KIND(repunicode);
8384 for (index = 0; index < repsize; index++) {
8385 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8386 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008388 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 return -1;
8390 }
8391 else if (x==enc_FAILED) {
8392 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008393 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 return -1;
8395 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008396 }
8397 *inpos = newpos;
8398 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008399 }
8400 return 0;
8401}
8402
Alexander Belopolsky40018472011-02-26 01:02:56 +00008403PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008404_PyUnicode_EncodeCharmap(PyObject *unicode,
8405 PyObject *mapping,
8406 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008407{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408 /* output object */
8409 PyObject *res = NULL;
8410 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008411 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008412 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008414 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008415 PyObject *errorHandler = NULL;
8416 PyObject *exc = NULL;
8417 /* the following variable is used for caching string comparisons
8418 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8419 * 3=ignore, 4=xmlcharrefreplace */
8420 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008421
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008422 if (PyUnicode_READY(unicode) < 0)
8423 return NULL;
8424 size = PyUnicode_GET_LENGTH(unicode);
8425
Guido van Rossumd57fd912000-03-10 22:53:23 +00008426 /* Default to Latin-1 */
8427 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008428 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008430 /* allocate enough for a simple encoding without
8431 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008432 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008433 if (res == NULL)
8434 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008435 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008438 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008439 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008441 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 if (x==enc_EXCEPTION) /* error */
8443 goto onError;
8444 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008445 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 &exc,
8447 &known_errorHandler, &errorHandler, errors,
8448 &res, &respos)) {
8449 goto onError;
8450 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008451 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 else
8453 /* done with this character => adjust input position */
8454 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008457 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008458 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008459 if (_PyBytes_Resize(&res, respos) < 0)
8460 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008461
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008462 Py_XDECREF(exc);
8463 Py_XDECREF(errorHandler);
8464 return res;
8465
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008467 Py_XDECREF(res);
8468 Py_XDECREF(exc);
8469 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470 return NULL;
8471}
8472
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008473/* Deprecated */
8474PyObject *
8475PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8476 Py_ssize_t size,
8477 PyObject *mapping,
8478 const char *errors)
8479{
8480 PyObject *result;
8481 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8482 if (unicode == NULL)
8483 return NULL;
8484 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8485 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008486 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008487}
8488
Alexander Belopolsky40018472011-02-26 01:02:56 +00008489PyObject *
8490PyUnicode_AsCharmapString(PyObject *unicode,
8491 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492{
8493 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 PyErr_BadArgument();
8495 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008496 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008497 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498}
8499
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008500/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008501static void
8502make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008503 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008504 Py_ssize_t startpos, Py_ssize_t endpos,
8505 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008507 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008508 *exceptionObject = _PyUnicodeTranslateError_Create(
8509 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008510 }
8511 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8513 goto onError;
8514 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8515 goto onError;
8516 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8517 goto onError;
8518 return;
8519 onError:
8520 Py_DECREF(*exceptionObject);
8521 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008522 }
8523}
8524
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008525/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008526static void
8527raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008528 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008529 Py_ssize_t startpos, Py_ssize_t endpos,
8530 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531{
8532 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008533 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008534 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008535 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008536}
8537
8538/* error handling callback helper:
8539 build arguments, call the callback and check the arguments,
8540 put the result into newpos and return the replacement string, which
8541 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008542static PyObject *
8543unicode_translate_call_errorhandler(const char *errors,
8544 PyObject **errorHandler,
8545 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008546 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008547 Py_ssize_t startpos, Py_ssize_t endpos,
8548 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008549{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008550 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008551
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008552 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008553 PyObject *restuple;
8554 PyObject *resunicode;
8555
8556 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008558 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008560 }
8561
8562 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008563 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008564 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008566
8567 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008569 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008571 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008572 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 Py_DECREF(restuple);
8574 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008575 }
8576 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 &resunicode, &i_newpos)) {
8578 Py_DECREF(restuple);
8579 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008580 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008581 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008582 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008583 else
8584 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008585 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8587 Py_DECREF(restuple);
8588 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008589 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008590 Py_INCREF(resunicode);
8591 Py_DECREF(restuple);
8592 return resunicode;
8593}
8594
8595/* Lookup the character ch in the mapping and put the result in result,
8596 which must be decrefed by the caller.
8597 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008598static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008599charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008600{
Christian Heimes217cfd12007-12-02 14:31:20 +00008601 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008602 PyObject *x;
8603
8604 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008605 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008606 x = PyObject_GetItem(mapping, w);
8607 Py_DECREF(w);
8608 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8610 /* No mapping found means: use 1:1 mapping. */
8611 PyErr_Clear();
8612 *result = NULL;
8613 return 0;
8614 } else
8615 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008616 }
8617 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 *result = x;
8619 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008620 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008621 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 long value = PyLong_AS_LONG(x);
8623 long max = PyUnicode_GetMax();
8624 if (value < 0 || value > max) {
8625 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008626 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 Py_DECREF(x);
8628 return -1;
8629 }
8630 *result = x;
8631 return 0;
8632 }
8633 else if (PyUnicode_Check(x)) {
8634 *result = x;
8635 return 0;
8636 }
8637 else {
8638 /* wrong return value */
8639 PyErr_SetString(PyExc_TypeError,
8640 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008641 Py_DECREF(x);
8642 return -1;
8643 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008644}
8645/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 if not reallocate and adjust various state variables.
8647 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008648static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008651{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008652 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008653 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 /* exponentially overallocate to minimize reallocations */
8655 if (requiredsize < 2 * oldsize)
8656 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8658 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008661 }
8662 return 0;
8663}
8664/* lookup the character, put the result in the output string and adjust
8665 various state variables. Return a new reference to the object that
8666 was put in the output buffer in *result, or Py_None, if the mapping was
8667 undefined (in which case no character was written).
8668 The called must decref result.
8669 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008670static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8672 PyObject *mapping, Py_UCS4 **output,
8673 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008674 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008675{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8677 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008679 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008681 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008682 }
8683 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008685 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008687 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008688 }
8689 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690 Py_ssize_t repsize;
8691 if (PyUnicode_READY(*res) == -1)
8692 return -1;
8693 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 if (repsize==1) {
8695 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008696 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008697 }
8698 else if (repsize!=0) {
8699 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008700 Py_ssize_t requiredsize = *opos +
8701 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008703 Py_ssize_t i;
8704 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008706 for(i = 0; i < repsize; i++)
8707 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008708 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008709 }
8710 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008711 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008712 return 0;
8713}
8714
Alexander Belopolsky40018472011-02-26 01:02:56 +00008715PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008716_PyUnicode_TranslateCharmap(PyObject *input,
8717 PyObject *mapping,
8718 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008720 /* input object */
8721 char *idata;
8722 Py_ssize_t size, i;
8723 int kind;
8724 /* output buffer */
8725 Py_UCS4 *output = NULL;
8726 Py_ssize_t osize;
8727 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008728 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008729 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008730 char *reason = "character maps to <undefined>";
8731 PyObject *errorHandler = NULL;
8732 PyObject *exc = NULL;
8733 /* the following variable is used for caching string comparisons
8734 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8735 * 3=ignore, 4=xmlcharrefreplace */
8736 int known_errorHandler = -1;
8737
Guido van Rossumd57fd912000-03-10 22:53:23 +00008738 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 PyErr_BadArgument();
8740 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008743 if (PyUnicode_READY(input) == -1)
8744 return NULL;
8745 idata = (char*)PyUnicode_DATA(input);
8746 kind = PyUnicode_KIND(input);
8747 size = PyUnicode_GET_LENGTH(input);
8748 i = 0;
8749
8750 if (size == 0) {
8751 Py_INCREF(input);
8752 return input;
8753 }
8754
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008755 /* allocate enough for a simple 1:1 translation without
8756 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008757 osize = size;
8758 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8759 opos = 0;
8760 if (output == NULL) {
8761 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008762 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008763 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008765 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008766 /* try to encode it */
8767 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008768 if (charmaptranslate_output(input, i, mapping,
8769 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 Py_XDECREF(x);
8771 goto onError;
8772 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008773 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008776 else { /* untranslatable character */
8777 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8778 Py_ssize_t repsize;
8779 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008781 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782 Py_ssize_t collstart = i;
8783 Py_ssize_t collend = i+1;
8784 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008785
Benjamin Peterson29060642009-01-31 22:14:21 +00008786 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 while (collend < size) {
8788 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008789 goto onError;
8790 Py_XDECREF(x);
8791 if (x!=Py_None)
8792 break;
8793 ++collend;
8794 }
8795 /* cache callback name lookup
8796 * (if not done yet, i.e. it's the first error) */
8797 if (known_errorHandler==-1) {
8798 if ((errors==NULL) || (!strcmp(errors, "strict")))
8799 known_errorHandler = 1;
8800 else if (!strcmp(errors, "replace"))
8801 known_errorHandler = 2;
8802 else if (!strcmp(errors, "ignore"))
8803 known_errorHandler = 3;
8804 else if (!strcmp(errors, "xmlcharrefreplace"))
8805 known_errorHandler = 4;
8806 else
8807 known_errorHandler = 0;
8808 }
8809 switch (known_errorHandler) {
8810 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008811 raise_translate_exception(&exc, input, collstart,
8812 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008813 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008814 case 2: /* replace */
8815 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008816 for (coll = collstart; coll<collend; coll++)
8817 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008818 /* fall through */
8819 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008820 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008821 break;
8822 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008823 /* generate replacement (temporarily (mis)uses i) */
8824 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008825 char buffer[2+29+1+1];
8826 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008827 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8828 if (charmaptranslate_makespace(&output, &osize,
8829 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008830 goto onError;
8831 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008832 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008833 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008834 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008835 break;
8836 default:
8837 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008838 reason, input, &exc,
8839 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008840 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008842 if (PyUnicode_READY(repunicode) < 0) {
8843 Py_DECREF(repunicode);
8844 goto onError;
8845 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008846 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008847 repsize = PyUnicode_GET_LENGTH(repunicode);
8848 if (charmaptranslate_makespace(&output, &osize,
8849 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008850 Py_DECREF(repunicode);
8851 goto onError;
8852 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008853 for (uni2 = 0; repsize-->0; ++uni2)
8854 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8855 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008856 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008857 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008858 }
8859 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008860 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8861 if (!res)
8862 goto onError;
8863 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008864 Py_XDECREF(exc);
8865 Py_XDECREF(errorHandler);
8866 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008867
Benjamin Peterson29060642009-01-31 22:14:21 +00008868 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008869 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008870 Py_XDECREF(exc);
8871 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872 return NULL;
8873}
8874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875/* Deprecated. Use PyUnicode_Translate instead. */
8876PyObject *
8877PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8878 Py_ssize_t size,
8879 PyObject *mapping,
8880 const char *errors)
8881{
8882 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8883 if (!unicode)
8884 return NULL;
8885 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8886}
8887
Alexander Belopolsky40018472011-02-26 01:02:56 +00008888PyObject *
8889PyUnicode_Translate(PyObject *str,
8890 PyObject *mapping,
8891 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008892{
8893 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008894
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895 str = PyUnicode_FromObject(str);
8896 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008899 Py_DECREF(str);
8900 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008901
Benjamin Peterson29060642009-01-31 22:14:21 +00008902 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008903 Py_XDECREF(str);
8904 return NULL;
8905}
Tim Petersced69f82003-09-16 20:30:58 +00008906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008907static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008908fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008909{
8910 /* No need to call PyUnicode_READY(self) because this function is only
8911 called as a callback from fixup() which does it already. */
8912 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8913 const int kind = PyUnicode_KIND(self);
8914 void *data = PyUnicode_DATA(self);
8915 Py_UCS4 maxchar = 0, ch, fixed;
8916 Py_ssize_t i;
8917
8918 for (i = 0; i < len; ++i) {
8919 ch = PyUnicode_READ(kind, data, i);
8920 fixed = 0;
8921 if (ch > 127) {
8922 if (Py_UNICODE_ISSPACE(ch))
8923 fixed = ' ';
8924 else {
8925 const int decimal = Py_UNICODE_TODECIMAL(ch);
8926 if (decimal >= 0)
8927 fixed = '0' + decimal;
8928 }
8929 if (fixed != 0) {
8930 if (fixed > maxchar)
8931 maxchar = fixed;
8932 PyUnicode_WRITE(kind, data, i, fixed);
8933 }
8934 else if (ch > maxchar)
8935 maxchar = ch;
8936 }
8937 else if (ch > maxchar)
8938 maxchar = ch;
8939 }
8940
8941 return maxchar;
8942}
8943
8944PyObject *
8945_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8946{
8947 if (!PyUnicode_Check(unicode)) {
8948 PyErr_BadInternalCall();
8949 return NULL;
8950 }
8951 if (PyUnicode_READY(unicode) == -1)
8952 return NULL;
8953 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8954 /* If the string is already ASCII, just return the same string */
8955 Py_INCREF(unicode);
8956 return unicode;
8957 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008958 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008959}
8960
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008961PyObject *
8962PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8963 Py_ssize_t length)
8964{
Victor Stinnerf0124502011-11-21 23:12:56 +01008965 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008966 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008967 Py_UCS4 maxchar;
8968 enum PyUnicode_Kind kind;
8969 void *data;
8970
8971 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008972 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008973 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008974 if (ch > 127) {
8975 int decimal = Py_UNICODE_TODECIMAL(ch);
8976 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008977 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008978 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008979 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008980 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008981
8982 /* Copy to a new string */
8983 decimal = PyUnicode_New(length, maxchar);
8984 if (decimal == NULL)
8985 return decimal;
8986 kind = PyUnicode_KIND(decimal);
8987 data = PyUnicode_DATA(decimal);
8988 /* Iterate over code points */
8989 for (i = 0; i < length; i++) {
8990 Py_UNICODE ch = s[i];
8991 if (ch > 127) {
8992 int decimal = Py_UNICODE_TODECIMAL(ch);
8993 if (decimal >= 0)
8994 ch = '0' + decimal;
8995 }
8996 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008997 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008998 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008999}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009000/* --- Decimal Encoder ---------------------------------------------------- */
9001
Alexander Belopolsky40018472011-02-26 01:02:56 +00009002int
9003PyUnicode_EncodeDecimal(Py_UNICODE *s,
9004 Py_ssize_t length,
9005 char *output,
9006 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009007{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009008 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009009 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009010 enum PyUnicode_Kind kind;
9011 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009012
9013 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009014 PyErr_BadArgument();
9015 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009016 }
9017
Victor Stinner42bf7752011-11-21 22:52:58 +01009018 unicode = PyUnicode_FromUnicode(s, length);
9019 if (unicode == NULL)
9020 return -1;
9021
Victor Stinner6345be92011-11-25 20:09:01 +01009022 if (PyUnicode_READY(unicode) < 0) {
9023 Py_DECREF(unicode);
9024 return -1;
9025 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009026 kind = PyUnicode_KIND(unicode);
9027 data = PyUnicode_DATA(unicode);
9028
Victor Stinnerb84d7232011-11-22 01:50:07 +01009029 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009030 PyObject *exc;
9031 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009032 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009033 Py_ssize_t startpos;
9034
9035 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009036
Benjamin Peterson29060642009-01-31 22:14:21 +00009037 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009038 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009039 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009040 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009041 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009042 decimal = Py_UNICODE_TODECIMAL(ch);
9043 if (decimal >= 0) {
9044 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009045 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009046 continue;
9047 }
9048 if (0 < ch && ch < 256) {
9049 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009050 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009051 continue;
9052 }
Victor Stinner6345be92011-11-25 20:09:01 +01009053
Victor Stinner42bf7752011-11-21 22:52:58 +01009054 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009055 exc = NULL;
9056 raise_encode_exception(&exc, "decimal", unicode,
9057 startpos, startpos+1,
9058 "invalid decimal Unicode string");
9059 Py_XDECREF(exc);
9060 Py_DECREF(unicode);
9061 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009062 }
9063 /* 0-terminate the output string */
9064 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009065 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009066 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009067}
9068
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069/* --- Helpers ------------------------------------------------------------ */
9070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009071static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009072any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009073 Py_ssize_t start,
9074 Py_ssize_t end)
9075{
9076 int kind1, kind2, kind;
9077 void *buf1, *buf2;
9078 Py_ssize_t len1, len2, result;
9079
9080 kind1 = PyUnicode_KIND(s1);
9081 kind2 = PyUnicode_KIND(s2);
9082 kind = kind1 > kind2 ? kind1 : kind2;
9083 buf1 = PyUnicode_DATA(s1);
9084 buf2 = PyUnicode_DATA(s2);
9085 if (kind1 != kind)
9086 buf1 = _PyUnicode_AsKind(s1, kind);
9087 if (!buf1)
9088 return -2;
9089 if (kind2 != kind)
9090 buf2 = _PyUnicode_AsKind(s2, kind);
9091 if (!buf2) {
9092 if (kind1 != kind) PyMem_Free(buf1);
9093 return -2;
9094 }
9095 len1 = PyUnicode_GET_LENGTH(s1);
9096 len2 = PyUnicode_GET_LENGTH(s2);
9097
Victor Stinner794d5672011-10-10 03:21:36 +02009098 if (direction > 0) {
9099 switch(kind) {
9100 case PyUnicode_1BYTE_KIND:
9101 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9102 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9103 else
9104 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9105 break;
9106 case PyUnicode_2BYTE_KIND:
9107 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9108 break;
9109 case PyUnicode_4BYTE_KIND:
9110 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9111 break;
9112 default:
9113 assert(0); result = -2;
9114 }
9115 }
9116 else {
9117 switch(kind) {
9118 case PyUnicode_1BYTE_KIND:
9119 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9120 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9121 else
9122 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9123 break;
9124 case PyUnicode_2BYTE_KIND:
9125 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9126 break;
9127 case PyUnicode_4BYTE_KIND:
9128 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9129 break;
9130 default:
9131 assert(0); result = -2;
9132 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009133 }
9134
9135 if (kind1 != kind)
9136 PyMem_Free(buf1);
9137 if (kind2 != kind)
9138 PyMem_Free(buf2);
9139
9140 return result;
9141}
9142
9143Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009144_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009145 Py_ssize_t n_buffer,
9146 void *digits, Py_ssize_t n_digits,
9147 Py_ssize_t min_width,
9148 const char *grouping,
9149 const char *thousands_sep)
9150{
9151 switch(kind) {
9152 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009153 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9154 return _PyUnicode_ascii_InsertThousandsGrouping(
9155 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9156 min_width, grouping, thousands_sep);
9157 else
9158 return _PyUnicode_ucs1_InsertThousandsGrouping(
9159 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9160 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009161 case PyUnicode_2BYTE_KIND:
9162 return _PyUnicode_ucs2_InsertThousandsGrouping(
9163 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9164 min_width, grouping, thousands_sep);
9165 case PyUnicode_4BYTE_KIND:
9166 return _PyUnicode_ucs4_InsertThousandsGrouping(
9167 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9168 min_width, grouping, thousands_sep);
9169 }
9170 assert(0);
9171 return -1;
9172}
9173
9174
/* Helper macro to fix up start/end slice values: end is clamped to len,
   and a negative start or end is taken relative to len and then clamped
   to 0.  start and end are assigned to and must be lvalues; every
   argument may be evaluated more than once.  The body is wrapped in
   do { } while (0) so that an invocation followed by ';' forms a single
   statement, e.g. as the unbraced branch of an if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009189
Alexander Belopolsky40018472011-02-26 01:02:56 +00009190Py_ssize_t
9191PyUnicode_Count(PyObject *str,
9192 PyObject *substr,
9193 Py_ssize_t start,
9194 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009195{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009196 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009197 PyObject* str_obj;
9198 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009199 int kind1, kind2, kind;
9200 void *buf1 = NULL, *buf2 = NULL;
9201 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009202
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009203 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009204 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009205 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009206 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009207 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009208 Py_DECREF(str_obj);
9209 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009210 }
Tim Petersced69f82003-09-16 20:30:58 +00009211
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009212 kind1 = PyUnicode_KIND(str_obj);
9213 kind2 = PyUnicode_KIND(sub_obj);
9214 kind = kind1 > kind2 ? kind1 : kind2;
9215 buf1 = PyUnicode_DATA(str_obj);
9216 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009217 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009218 if (!buf1)
9219 goto onError;
9220 buf2 = PyUnicode_DATA(sub_obj);
9221 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009222 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009223 if (!buf2)
9224 goto onError;
9225 len1 = PyUnicode_GET_LENGTH(str_obj);
9226 len2 = PyUnicode_GET_LENGTH(sub_obj);
9227
9228 ADJUST_INDICES(start, end, len1);
9229 switch(kind) {
9230 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009231 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9232 result = asciilib_count(
9233 ((Py_UCS1*)buf1) + start, end - start,
9234 buf2, len2, PY_SSIZE_T_MAX
9235 );
9236 else
9237 result = ucs1lib_count(
9238 ((Py_UCS1*)buf1) + start, end - start,
9239 buf2, len2, PY_SSIZE_T_MAX
9240 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009241 break;
9242 case PyUnicode_2BYTE_KIND:
9243 result = ucs2lib_count(
9244 ((Py_UCS2*)buf1) + start, end - start,
9245 buf2, len2, PY_SSIZE_T_MAX
9246 );
9247 break;
9248 case PyUnicode_4BYTE_KIND:
9249 result = ucs4lib_count(
9250 ((Py_UCS4*)buf1) + start, end - start,
9251 buf2, len2, PY_SSIZE_T_MAX
9252 );
9253 break;
9254 default:
9255 assert(0); result = 0;
9256 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009257
9258 Py_DECREF(sub_obj);
9259 Py_DECREF(str_obj);
9260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261 if (kind1 != kind)
9262 PyMem_Free(buf1);
9263 if (kind2 != kind)
9264 PyMem_Free(buf2);
9265
Guido van Rossumd57fd912000-03-10 22:53:23 +00009266 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267 onError:
9268 Py_DECREF(sub_obj);
9269 Py_DECREF(str_obj);
9270 if (kind1 != kind && buf1)
9271 PyMem_Free(buf1);
9272 if (kind2 != kind && buf2)
9273 PyMem_Free(buf2);
9274 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009275}
9276
Alexander Belopolsky40018472011-02-26 01:02:56 +00009277Py_ssize_t
9278PyUnicode_Find(PyObject *str,
9279 PyObject *sub,
9280 Py_ssize_t start,
9281 Py_ssize_t end,
9282 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009283{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009284 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009285
Guido van Rossumd57fd912000-03-10 22:53:23 +00009286 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009287 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009288 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009289 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009290 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009291 Py_DECREF(str);
9292 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293 }
Tim Petersced69f82003-09-16 20:30:58 +00009294
Victor Stinner794d5672011-10-10 03:21:36 +02009295 result = any_find_slice(direction,
9296 str, sub, start, end
9297 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009298
Guido van Rossumd57fd912000-03-10 22:53:23 +00009299 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009300 Py_DECREF(sub);
9301
Guido van Rossumd57fd912000-03-10 22:53:23 +00009302 return result;
9303}
9304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305Py_ssize_t
9306PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9307 Py_ssize_t start, Py_ssize_t end,
9308 int direction)
9309{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009310 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009311 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312 if (PyUnicode_READY(str) == -1)
9313 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009314 if (start < 0 || end < 0) {
9315 PyErr_SetString(PyExc_IndexError, "string index out of range");
9316 return -2;
9317 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 if (end > PyUnicode_GET_LENGTH(str))
9319 end = PyUnicode_GET_LENGTH(str);
9320 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009321 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9322 kind, end-start, ch, direction);
9323 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009324 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009325 else
9326 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009327}
9328
Alexander Belopolsky40018472011-02-26 01:02:56 +00009329static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009330tailmatch(PyObject *self,
9331 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009332 Py_ssize_t start,
9333 Py_ssize_t end,
9334 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009335{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009336 int kind_self;
9337 int kind_sub;
9338 void *data_self;
9339 void *data_sub;
9340 Py_ssize_t offset;
9341 Py_ssize_t i;
9342 Py_ssize_t end_sub;
9343
9344 if (PyUnicode_READY(self) == -1 ||
9345 PyUnicode_READY(substring) == -1)
9346 return 0;
9347
9348 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349 return 1;
9350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009351 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9352 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009353 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009354 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009356 kind_self = PyUnicode_KIND(self);
9357 data_self = PyUnicode_DATA(self);
9358 kind_sub = PyUnicode_KIND(substring);
9359 data_sub = PyUnicode_DATA(substring);
9360 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9361
9362 if (direction > 0)
9363 offset = end;
9364 else
9365 offset = start;
9366
9367 if (PyUnicode_READ(kind_self, data_self, offset) ==
9368 PyUnicode_READ(kind_sub, data_sub, 0) &&
9369 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9370 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9371 /* If both are of the same kind, memcmp is sufficient */
9372 if (kind_self == kind_sub) {
9373 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009374 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009375 data_sub,
9376 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009377 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 }
9379 /* otherwise we have to compare each character by first accesing it */
9380 else {
9381 /* We do not need to compare 0 and len(substring)-1 because
9382 the if statement above ensured already that they are equal
9383 when we end up here. */
9384 // TODO: honor direction and do a forward or backwards search
9385 for (i = 1; i < end_sub; ++i) {
9386 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9387 PyUnicode_READ(kind_sub, data_sub, i))
9388 return 0;
9389 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009390 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009392 }
9393
9394 return 0;
9395}
9396
Alexander Belopolsky40018472011-02-26 01:02:56 +00009397Py_ssize_t
9398PyUnicode_Tailmatch(PyObject *str,
9399 PyObject *substr,
9400 Py_ssize_t start,
9401 Py_ssize_t end,
9402 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009404 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009405
Guido van Rossumd57fd912000-03-10 22:53:23 +00009406 str = PyUnicode_FromObject(str);
9407 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009408 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009409 substr = PyUnicode_FromObject(substr);
9410 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009411 Py_DECREF(str);
9412 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009413 }
Tim Petersced69f82003-09-16 20:30:58 +00009414
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009415 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009416 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009417 Py_DECREF(str);
9418 Py_DECREF(substr);
9419 return result;
9420}
9421
Guido van Rossumd57fd912000-03-10 22:53:23 +00009422/* Apply fixfct filter to the Unicode object self and return a
9423 reference to the modified object */
9424
Alexander Belopolsky40018472011-02-26 01:02:56 +00009425static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009426fixup(PyObject *self,
9427 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009428{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009429 PyObject *u;
9430 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009431 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009432
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009433 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009434 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009435 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009436 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009437
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009438 /* fix functions return the new maximum character in a string,
9439 if the kind of the resulting unicode object does not change,
9440 everything is fine. Otherwise we need to change the string kind
9441 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009442 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009443
9444 if (maxchar_new == 0) {
9445 /* no changes */;
9446 if (PyUnicode_CheckExact(self)) {
9447 Py_DECREF(u);
9448 Py_INCREF(self);
9449 return self;
9450 }
9451 else
9452 return u;
9453 }
9454
9455 if (maxchar_new <= 127)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009456 maxchar_new = 127;
9457 else if (maxchar_new <= 255)
9458 maxchar_new = 255;
9459 else if (maxchar_new <= 65535)
9460 maxchar_new = 65535;
9461 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009462 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463
Victor Stinnereaab6042011-12-11 22:22:39 +01009464 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009466
9467 /* In case the maximum character changed, we need to
9468 convert the string to the new category. */
9469 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9470 if (v == NULL) {
9471 Py_DECREF(u);
9472 return NULL;
9473 }
9474 if (maxchar_new > maxchar_old) {
9475 /* If the maxchar increased so that the kind changed, not all
9476 characters are representable anymore and we need to fix the
9477 string again. This only happens in very few cases. */
9478 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9479 maxchar_old = fixfct(v);
9480 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481 }
9482 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009483 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009485 Py_DECREF(u);
9486 assert(_PyUnicode_CheckConsistency(v, 1));
9487 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488}
9489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009491fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009493 /* No need to call PyUnicode_READY(self) because this function is only
9494 called as a callback from fixup() which does it already. */
9495 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9496 const int kind = PyUnicode_KIND(self);
9497 void *data = PyUnicode_DATA(self);
9498 int touched = 0;
9499 Py_UCS4 maxchar = 0;
9500 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 for (i = 0; i < len; ++i) {
9503 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9504 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9505 if (up != ch) {
9506 if (up > maxchar)
9507 maxchar = up;
9508 PyUnicode_WRITE(kind, data, i, up);
9509 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009510 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511 else if (ch > maxchar)
9512 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009513 }
9514
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009515 if (touched)
9516 return maxchar;
9517 else
9518 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519}
9520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009521static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009522fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009523{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009524 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9525 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9526 const int kind = PyUnicode_KIND(self);
9527 void *data = PyUnicode_DATA(self);
9528 int touched = 0;
9529 Py_UCS4 maxchar = 0;
9530 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 for(i = 0; i < len; ++i) {
9533 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9534 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9535 if (lo != ch) {
9536 if (lo > maxchar)
9537 maxchar = lo;
9538 PyUnicode_WRITE(kind, data, i, lo);
9539 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009540 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009541 else if (ch > maxchar)
9542 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009543 }
9544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 if (touched)
9546 return maxchar;
9547 else
9548 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009549}
9550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009551static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009552fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009553{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009554 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9555 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9556 const int kind = PyUnicode_KIND(self);
9557 void *data = PyUnicode_DATA(self);
9558 int touched = 0;
9559 Py_UCS4 maxchar = 0;
9560 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009561
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 for(i = 0; i < len; ++i) {
9563 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9564 Py_UCS4 nu = 0;
9565
9566 if (Py_UNICODE_ISUPPER(ch))
9567 nu = Py_UNICODE_TOLOWER(ch);
9568 else if (Py_UNICODE_ISLOWER(ch))
9569 nu = Py_UNICODE_TOUPPER(ch);
9570
9571 if (nu != 0) {
9572 if (nu > maxchar)
9573 maxchar = nu;
9574 PyUnicode_WRITE(kind, data, i, nu);
9575 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009576 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 else if (ch > maxchar)
9578 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009579 }
9580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009581 if (touched)
9582 return maxchar;
9583 else
9584 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585}
9586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009587static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009588fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9591 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9592 const int kind = PyUnicode_KIND(self);
9593 void *data = PyUnicode_DATA(self);
9594 int touched = 0;
9595 Py_UCS4 maxchar = 0;
9596 Py_ssize_t i = 0;
9597 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009598
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009599 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009600 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601
9602 ch = PyUnicode_READ(kind, data, i);
9603 if (!Py_UNICODE_ISUPPER(ch)) {
9604 maxchar = Py_UNICODE_TOUPPER(ch);
9605 PyUnicode_WRITE(kind, data, i, maxchar);
9606 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009608 ++i;
9609 for(; i < len; ++i) {
9610 ch = PyUnicode_READ(kind, data, i);
9611 if (!Py_UNICODE_ISLOWER(ch)) {
9612 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9613 if (lo > maxchar)
9614 maxchar = lo;
9615 PyUnicode_WRITE(kind, data, i, lo);
9616 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009617 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009618 else if (ch > maxchar)
9619 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009620 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009621
9622 if (touched)
9623 return maxchar;
9624 else
9625 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009626}
9627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009629fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9632 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9633 const int kind = PyUnicode_KIND(self);
9634 void *data = PyUnicode_DATA(self);
9635 Py_UCS4 maxchar = 0;
9636 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637 int previous_is_cased;
9638
9639 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009640 if (len == 1) {
9641 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9642 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9643 if (ti != ch) {
9644 PyUnicode_WRITE(kind, data, i, ti);
9645 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009646 }
9647 else
9648 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009649 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009650 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009651 for(; i < len; ++i) {
9652 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9653 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009654
Benjamin Peterson29060642009-01-31 22:14:21 +00009655 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009656 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009657 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009658 nu = Py_UNICODE_TOTITLE(ch);
9659
9660 if (nu > maxchar)
9661 maxchar = nu;
9662 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009663
Benjamin Peterson29060642009-01-31 22:14:21 +00009664 if (Py_UNICODE_ISLOWER(ch) ||
9665 Py_UNICODE_ISUPPER(ch) ||
9666 Py_UNICODE_ISTITLE(ch))
9667 previous_is_cased = 1;
9668 else
9669 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009671 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009672}
9673
Tim Peters8ce9f162004-08-27 01:49:32 +00009674PyObject *
9675PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009676{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009678 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009679 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009680 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009681 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9682 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009683 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009684 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009685 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009686 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009687 int use_memcpy;
9688 unsigned char *res_data = NULL, *sep_data = NULL;
9689 PyObject *last_obj;
9690 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691
Tim Peters05eba1f2004-08-27 21:32:02 +00009692 fseq = PySequence_Fast(seq, "");
9693 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009694 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009695 }
9696
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009697 /* NOTE: the following code can't call back into Python code,
9698 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009699 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009700
Tim Peters05eba1f2004-08-27 21:32:02 +00009701 seqlen = PySequence_Fast_GET_SIZE(fseq);
9702 /* If empty sequence, return u"". */
9703 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009704 Py_DECREF(fseq);
9705 Py_INCREF(unicode_empty);
9706 res = unicode_empty;
9707 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009708 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009709
Tim Peters05eba1f2004-08-27 21:32:02 +00009710 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009711 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009712 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009713 if (seqlen == 1) {
9714 if (PyUnicode_CheckExact(items[0])) {
9715 res = items[0];
9716 Py_INCREF(res);
9717 Py_DECREF(fseq);
9718 return res;
9719 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009720 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009721 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009722 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009723 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009724 /* Set up sep and seplen */
9725 if (separator == NULL) {
9726 /* fall back to a blank space separator */
9727 sep = PyUnicode_FromOrdinal(' ');
9728 if (!sep)
9729 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009730 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009731 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009732 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009733 else {
9734 if (!PyUnicode_Check(separator)) {
9735 PyErr_Format(PyExc_TypeError,
9736 "separator: expected str instance,"
9737 " %.80s found",
9738 Py_TYPE(separator)->tp_name);
9739 goto onError;
9740 }
9741 if (PyUnicode_READY(separator))
9742 goto onError;
9743 sep = separator;
9744 seplen = PyUnicode_GET_LENGTH(separator);
9745 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9746 /* inc refcount to keep this code path symmetric with the
9747 above case of a blank separator */
9748 Py_INCREF(sep);
9749 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009750 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009751 }
9752
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009753 /* There are at least two things to join, or else we have a subclass
9754 * of str in the sequence.
9755 * Do a pre-pass to figure out the total amount of space we'll
9756 * need (sz), and see whether all argument are strings.
9757 */
9758 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009759#ifdef Py_DEBUG
9760 use_memcpy = 0;
9761#else
9762 use_memcpy = 1;
9763#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009764 for (i = 0; i < seqlen; i++) {
9765 const Py_ssize_t old_sz = sz;
9766 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009767 if (!PyUnicode_Check(item)) {
9768 PyErr_Format(PyExc_TypeError,
9769 "sequence item %zd: expected str instance,"
9770 " %.80s found",
9771 i, Py_TYPE(item)->tp_name);
9772 goto onError;
9773 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009774 if (PyUnicode_READY(item) == -1)
9775 goto onError;
9776 sz += PyUnicode_GET_LENGTH(item);
9777 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009778 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009779 if (i != 0)
9780 sz += seplen;
9781 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9782 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009783 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009784 goto onError;
9785 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009786 if (use_memcpy && last_obj != NULL) {
9787 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9788 use_memcpy = 0;
9789 }
9790 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009791 }
Tim Petersced69f82003-09-16 20:30:58 +00009792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009793 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009794 if (res == NULL)
9795 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009796
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009797 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009798#ifdef Py_DEBUG
9799 use_memcpy = 0;
9800#else
9801 if (use_memcpy) {
9802 res_data = PyUnicode_1BYTE_DATA(res);
9803 kind = PyUnicode_KIND(res);
9804 if (seplen != 0)
9805 sep_data = PyUnicode_1BYTE_DATA(sep);
9806 }
9807#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009808 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009809 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009810 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009811 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009812 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009813 if (use_memcpy) {
9814 Py_MEMCPY(res_data,
9815 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009816 kind * seplen);
9817 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009818 }
9819 else {
9820 copy_characters(res, res_offset, sep, 0, seplen);
9821 res_offset += seplen;
9822 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009823 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009824 itemlen = PyUnicode_GET_LENGTH(item);
9825 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009826 if (use_memcpy) {
9827 Py_MEMCPY(res_data,
9828 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009829 kind * itemlen);
9830 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009831 }
9832 else {
9833 copy_characters(res, res_offset, item, 0, itemlen);
9834 res_offset += itemlen;
9835 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009836 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009837 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009838 if (use_memcpy)
9839 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009840 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009841 else
9842 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009843
Tim Peters05eba1f2004-08-27 21:32:02 +00009844 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009845 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009846 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009847 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009848
Benjamin Peterson29060642009-01-31 22:14:21 +00009849 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009850 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009852 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009853 return NULL;
9854}
9855
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009856#define FILL(kind, data, value, start, length) \
9857 do { \
9858 Py_ssize_t i_ = 0; \
9859 assert(kind != PyUnicode_WCHAR_KIND); \
9860 switch ((kind)) { \
9861 case PyUnicode_1BYTE_KIND: { \
9862 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9863 memset(to_, (unsigned char)value, length); \
9864 break; \
9865 } \
9866 case PyUnicode_2BYTE_KIND: { \
9867 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9868 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9869 break; \
9870 } \
9871 default: { \
9872 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9873 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9874 break; \
9875 } \
9876 } \
9877 } while (0)
9878
Victor Stinner9310abb2011-10-05 00:59:23 +02009879static PyObject *
9880pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009881 Py_ssize_t left,
9882 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009883 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009884{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009885 PyObject *u;
9886 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009887 int kind;
9888 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009889
9890 if (left < 0)
9891 left = 0;
9892 if (right < 0)
9893 right = 0;
9894
Victor Stinnerc4b49542011-12-11 22:44:26 +01009895 if (left == 0 && right == 0)
9896 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009898 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9899 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009900 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9901 return NULL;
9902 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009903 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9904 if (fill > maxchar)
9905 maxchar = fill;
9906 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009907 if (!u)
9908 return NULL;
9909
9910 kind = PyUnicode_KIND(u);
9911 data = PyUnicode_DATA(u);
9912 if (left)
9913 FILL(kind, data, fill, 0, left);
9914 if (right)
9915 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009916 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009917 assert(_PyUnicode_CheckConsistency(u, 1));
9918 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009919}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009921
Alexander Belopolsky40018472011-02-26 01:02:56 +00009922PyObject *
9923PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009925 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009926
9927 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009928 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009929 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009931 switch(PyUnicode_KIND(string)) {
9932 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009933 if (PyUnicode_IS_ASCII(string))
9934 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009935 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009936 PyUnicode_GET_LENGTH(string), keepends);
9937 else
9938 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009939 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009940 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 break;
9942 case PyUnicode_2BYTE_KIND:
9943 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009944 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945 PyUnicode_GET_LENGTH(string), keepends);
9946 break;
9947 case PyUnicode_4BYTE_KIND:
9948 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009949 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009950 PyUnicode_GET_LENGTH(string), keepends);
9951 break;
9952 default:
9953 assert(0);
9954 list = 0;
9955 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009956 Py_DECREF(string);
9957 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009958}
9959
Alexander Belopolsky40018472011-02-26 01:02:56 +00009960static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009961split(PyObject *self,
9962 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009963 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009964{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009965 int kind1, kind2, kind;
9966 void *buf1, *buf2;
9967 Py_ssize_t len1, len2;
9968 PyObject* out;
9969
Guido van Rossumd57fd912000-03-10 22:53:23 +00009970 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009971 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009973 if (PyUnicode_READY(self) == -1)
9974 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 if (substring == NULL)
9977 switch(PyUnicode_KIND(self)) {
9978 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009979 if (PyUnicode_IS_ASCII(self))
9980 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009981 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009982 PyUnicode_GET_LENGTH(self), maxcount
9983 );
9984 else
9985 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009986 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009987 PyUnicode_GET_LENGTH(self), maxcount
9988 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009989 case PyUnicode_2BYTE_KIND:
9990 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009991 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 PyUnicode_GET_LENGTH(self), maxcount
9993 );
9994 case PyUnicode_4BYTE_KIND:
9995 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009996 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009997 PyUnicode_GET_LENGTH(self), maxcount
9998 );
9999 default:
10000 assert(0);
10001 return NULL;
10002 }
10003
10004 if (PyUnicode_READY(substring) == -1)
10005 return NULL;
10006
10007 kind1 = PyUnicode_KIND(self);
10008 kind2 = PyUnicode_KIND(substring);
10009 kind = kind1 > kind2 ? kind1 : kind2;
10010 buf1 = PyUnicode_DATA(self);
10011 buf2 = PyUnicode_DATA(substring);
10012 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010013 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010014 if (!buf1)
10015 return NULL;
10016 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010017 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018 if (!buf2) {
10019 if (kind1 != kind) PyMem_Free(buf1);
10020 return NULL;
10021 }
10022 len1 = PyUnicode_GET_LENGTH(self);
10023 len2 = PyUnicode_GET_LENGTH(substring);
10024
10025 switch(kind) {
10026 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010027 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10028 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010029 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010030 else
10031 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010032 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010033 break;
10034 case PyUnicode_2BYTE_KIND:
10035 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010036 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 break;
10038 case PyUnicode_4BYTE_KIND:
10039 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010040 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041 break;
10042 default:
10043 out = NULL;
10044 }
10045 if (kind1 != kind)
10046 PyMem_Free(buf1);
10047 if (kind2 != kind)
10048 PyMem_Free(buf2);
10049 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010050}
10051
Alexander Belopolsky40018472011-02-26 01:02:56 +000010052static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010053rsplit(PyObject *self,
10054 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010055 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010056{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 int kind1, kind2, kind;
10058 void *buf1, *buf2;
10059 Py_ssize_t len1, len2;
10060 PyObject* out;
10061
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010062 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010063 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 if (PyUnicode_READY(self) == -1)
10066 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010068 if (substring == NULL)
10069 switch(PyUnicode_KIND(self)) {
10070 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010071 if (PyUnicode_IS_ASCII(self))
10072 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010073 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010074 PyUnicode_GET_LENGTH(self), maxcount
10075 );
10076 else
10077 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010078 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010079 PyUnicode_GET_LENGTH(self), maxcount
10080 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 case PyUnicode_2BYTE_KIND:
10082 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010083 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 PyUnicode_GET_LENGTH(self), maxcount
10085 );
10086 case PyUnicode_4BYTE_KIND:
10087 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010088 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 PyUnicode_GET_LENGTH(self), maxcount
10090 );
10091 default:
10092 assert(0);
10093 return NULL;
10094 }
10095
10096 if (PyUnicode_READY(substring) == -1)
10097 return NULL;
10098
10099 kind1 = PyUnicode_KIND(self);
10100 kind2 = PyUnicode_KIND(substring);
10101 kind = kind1 > kind2 ? kind1 : kind2;
10102 buf1 = PyUnicode_DATA(self);
10103 buf2 = PyUnicode_DATA(substring);
10104 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010105 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 if (!buf1)
10107 return NULL;
10108 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010109 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 if (!buf2) {
10111 if (kind1 != kind) PyMem_Free(buf1);
10112 return NULL;
10113 }
10114 len1 = PyUnicode_GET_LENGTH(self);
10115 len2 = PyUnicode_GET_LENGTH(substring);
10116
10117 switch(kind) {
10118 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010119 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10120 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010121 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010122 else
10123 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010124 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010125 break;
10126 case PyUnicode_2BYTE_KIND:
10127 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010128 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010129 break;
10130 case PyUnicode_4BYTE_KIND:
10131 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010132 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010133 break;
10134 default:
10135 out = NULL;
10136 }
10137 if (kind1 != kind)
10138 PyMem_Free(buf1);
10139 if (kind2 != kind)
10140 PyMem_Free(buf2);
10141 return out;
10142}
10143
10144static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010145anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10146 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147{
10148 switch(kind) {
10149 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010150 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10151 return asciilib_find(buf1, len1, buf2, len2, offset);
10152 else
10153 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 case PyUnicode_2BYTE_KIND:
10155 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10156 case PyUnicode_4BYTE_KIND:
10157 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10158 }
10159 assert(0);
10160 return -1;
10161}
10162
10163static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010164anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10165 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166{
10167 switch(kind) {
10168 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010169 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10170 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10171 else
10172 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 case PyUnicode_2BYTE_KIND:
10174 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10175 case PyUnicode_4BYTE_KIND:
10176 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10177 }
10178 assert(0);
10179 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010180}
10181
Alexander Belopolsky40018472011-02-26 01:02:56 +000010182static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183replace(PyObject *self, PyObject *str1,
10184 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010185{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 PyObject *u;
10187 char *sbuf = PyUnicode_DATA(self);
10188 char *buf1 = PyUnicode_DATA(str1);
10189 char *buf2 = PyUnicode_DATA(str2);
10190 int srelease = 0, release1 = 0, release2 = 0;
10191 int skind = PyUnicode_KIND(self);
10192 int kind1 = PyUnicode_KIND(str1);
10193 int kind2 = PyUnicode_KIND(str2);
10194 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10195 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10196 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010197 int mayshrink;
10198 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199
10200 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010201 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010203 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010204
Victor Stinner59de0ee2011-10-07 10:01:28 +020010205 if (str1 == str2)
10206 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 if (skind < kind1)
10208 /* substring too wide to be present */
10209 goto nothing;
10210
Victor Stinner49a0a212011-10-12 23:46:10 +020010211 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10212 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10213 /* Replacing str1 with str2 may cause a maxchar reduction in the
10214 result string. */
10215 mayshrink = (maxchar_str2 < maxchar);
10216 maxchar = Py_MAX(maxchar, maxchar_str2);
10217
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010219 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010220 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010222 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010224 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010225 Py_UCS4 u1, u2;
10226 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010228 if (findchar(sbuf, PyUnicode_KIND(self),
10229 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010230 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010233 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010235 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010236 rkind = PyUnicode_KIND(u);
10237 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10238 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010239 if (--maxcount < 0)
10240 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010242 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010243 }
10244 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 int rkind = skind;
10246 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 if (kind1 < rkind) {
10249 /* widen substring */
10250 buf1 = _PyUnicode_AsKind(str1, rkind);
10251 if (!buf1) goto error;
10252 release1 = 1;
10253 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010254 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010255 if (i < 0)
10256 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 if (rkind > kind2) {
10258 /* widen replacement */
10259 buf2 = _PyUnicode_AsKind(str2, rkind);
10260 if (!buf2) goto error;
10261 release2 = 1;
10262 }
10263 else if (rkind < kind2) {
10264 /* widen self and buf1 */
10265 rkind = kind2;
10266 if (release1) PyMem_Free(buf1);
10267 sbuf = _PyUnicode_AsKind(self, rkind);
10268 if (!sbuf) goto error;
10269 srelease = 1;
10270 buf1 = _PyUnicode_AsKind(str1, rkind);
10271 if (!buf1) goto error;
10272 release1 = 1;
10273 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010274 u = PyUnicode_New(slen, maxchar);
10275 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010277 assert(PyUnicode_KIND(u) == rkind);
10278 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010279
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010280 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010281 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010282 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010284 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010286
10287 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010288 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010289 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010290 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010291 if (i == -1)
10292 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010293 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010295 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010297 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010298 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010299 }
10300 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 Py_ssize_t n, i, j, ires;
10302 Py_ssize_t product, new_size;
10303 int rkind = skind;
10304 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010307 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 buf1 = _PyUnicode_AsKind(str1, rkind);
10309 if (!buf1) goto error;
10310 release1 = 1;
10311 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010312 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010313 if (n == 0)
10314 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010316 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 buf2 = _PyUnicode_AsKind(str2, rkind);
10318 if (!buf2) goto error;
10319 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010322 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 rkind = kind2;
10324 sbuf = _PyUnicode_AsKind(self, rkind);
10325 if (!sbuf) goto error;
10326 srelease = 1;
10327 if (release1) PyMem_Free(buf1);
10328 buf1 = _PyUnicode_AsKind(str1, rkind);
10329 if (!buf1) goto error;
10330 release1 = 1;
10331 }
10332 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10333 PyUnicode_GET_LENGTH(str1))); */
10334 product = n * (len2-len1);
10335 if ((product / (len2-len1)) != n) {
10336 PyErr_SetString(PyExc_OverflowError,
10337 "replace string is too long");
10338 goto error;
10339 }
10340 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010341 if (new_size == 0) {
10342 Py_INCREF(unicode_empty);
10343 u = unicode_empty;
10344 goto done;
10345 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10347 PyErr_SetString(PyExc_OverflowError,
10348 "replace string is too long");
10349 goto error;
10350 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010351 u = PyUnicode_New(new_size, maxchar);
10352 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010354 assert(PyUnicode_KIND(u) == rkind);
10355 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 ires = i = 0;
10357 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010358 while (n-- > 0) {
10359 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010360 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010361 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010362 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010363 if (j == -1)
10364 break;
10365 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010366 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010367 memcpy(res + rkind * ires,
10368 sbuf + rkind * i,
10369 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010371 }
10372 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010374 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010376 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010378 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010380 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010382 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010383 memcpy(res + rkind * ires,
10384 sbuf + rkind * i,
10385 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010386 }
10387 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010388 /* interleave */
10389 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010390 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010392 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010394 if (--n <= 0)
10395 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010396 memcpy(res + rkind * ires,
10397 sbuf + rkind * i,
10398 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399 ires++;
10400 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010401 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010402 memcpy(res + rkind * ires,
10403 sbuf + rkind * i,
10404 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010405 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010406 }
10407
10408 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010409 unicode_adjust_maxchar(&u);
10410 if (u == NULL)
10411 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010412 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010413
10414 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 if (srelease)
10416 PyMem_FREE(sbuf);
10417 if (release1)
10418 PyMem_FREE(buf1);
10419 if (release2)
10420 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010421 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010422 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010423
Benjamin Peterson29060642009-01-31 22:14:21 +000010424 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010425 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 if (srelease)
10427 PyMem_FREE(sbuf);
10428 if (release1)
10429 PyMem_FREE(buf1);
10430 if (release2)
10431 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010432 return unicode_result_unchanged(self);
10433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434 error:
10435 if (srelease && sbuf)
10436 PyMem_FREE(sbuf);
10437 if (release1 && buf1)
10438 PyMem_FREE(buf1);
10439 if (release2 && buf2)
10440 PyMem_FREE(buf2);
10441 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010442}
10443
10444/* --- Unicode Object Methods --------------------------------------------- */
10445
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010446PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010447 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010448\n\
10449Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010450characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010451
10452static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010453unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010454{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010455 return fixup(self, fixtitle);
10456}
10457
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010458PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010459 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010460\n\
10461Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010462have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010463
10464static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010465unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010466{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010467 return fixup(self, fixcapitalize);
10468}
10469
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace, replace
   every word by its fixcapitalize'd version, then join the words again.
   Returns NULL as soon as any step fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized = fixup(PyList_GET_ITEM(words, idx),
                                      fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10507
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010508/* Argument converter. Coerces to a single unicode character */
10509
10510static int
10511convert_uc(PyObject *obj, void *addr)
10512{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010514 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010515
Benjamin Peterson14339b62009-01-31 16:36:08 +000010516 uniobj = PyUnicode_FromObject(obj);
10517 if (uniobj == NULL) {
10518 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010519 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010520 return 0;
10521 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010523 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010524 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010525 Py_DECREF(uniobj);
10526 return 0;
10527 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010529 Py_DECREF(uniobj);
10530 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010531}
10532
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010533PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010534 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010535\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010536Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010537done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010538
10539static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010540unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010541{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010542 Py_ssize_t marg, left;
10543 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 Py_UCS4 fillchar = ' ';
10545
Victor Stinnere9a29352011-10-01 02:14:59 +020010546 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010548
Victor Stinnerc4b49542011-12-11 22:44:26 +010010549 if (PyUnicode_READY(self) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010550 return NULL;
10551
Victor Stinnerc4b49542011-12-11 22:44:26 +010010552 if (PyUnicode_GET_LENGTH(self) >= width)
10553 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010554
Victor Stinnerc4b49542011-12-11 22:44:26 +010010555 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010556 left = marg / 2 + (marg & width & 1);
10557
Victor Stinner9310abb2011-10-05 00:59:23 +020010558 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010559}
10560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561/* This function assumes that str1 and str2 are readied by the caller. */
10562
Marc-André Lemburge5034372000-08-08 08:04:29 +000010563static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010564unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010565{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 int kind1, kind2;
10567 void *data1, *data2;
10568 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 kind1 = PyUnicode_KIND(str1);
10571 kind2 = PyUnicode_KIND(str2);
10572 data1 = PyUnicode_DATA(str1);
10573 data2 = PyUnicode_DATA(str2);
10574 len1 = PyUnicode_GET_LENGTH(str1);
10575 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 for (i = 0; i < len1 && i < len2; ++i) {
10578 Py_UCS4 c1, c2;
10579 c1 = PyUnicode_READ(kind1, data1, i);
10580 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010581
10582 if (c1 != c2)
10583 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010584 }
10585
10586 return (len1 < len2) ? -1 : (len1 != len2);
10587}
10588
Alexander Belopolsky40018472011-02-26 01:02:56 +000010589int
10590PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010591{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10593 if (PyUnicode_READY(left) == -1 ||
10594 PyUnicode_READY(right) == -1)
10595 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010596 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010598 PyErr_Format(PyExc_TypeError,
10599 "Can't compare %.100s and %.100s",
10600 left->ob_type->tp_name,
10601 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010602 return -1;
10603}
10604
Martin v. Löwis5b222132007-06-10 09:51:05 +000010605int
10606PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10607{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 Py_ssize_t i;
10609 int kind;
10610 void *data;
10611 Py_UCS4 chr;
10612
Victor Stinner910337b2011-10-03 03:20:16 +020010613 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 if (PyUnicode_READY(uni) == -1)
10615 return -1;
10616 kind = PyUnicode_KIND(uni);
10617 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010618 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10620 if (chr != str[i])
10621 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010622 /* This check keeps Python strings that end in '\0' from comparing equal
10623 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010625 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010626 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010627 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010628 return 0;
10629}
10630
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010631
Benjamin Peterson29060642009-01-31 22:14:21 +000010632#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010633 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010634
Alexander Belopolsky40018472011-02-26 01:02:56 +000010635PyObject *
10636PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010637{
10638 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010639
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010640 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10641 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 if (PyUnicode_READY(left) == -1 ||
10643 PyUnicode_READY(right) == -1)
10644 return NULL;
10645 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10646 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010647 if (op == Py_EQ) {
10648 Py_INCREF(Py_False);
10649 return Py_False;
10650 }
10651 if (op == Py_NE) {
10652 Py_INCREF(Py_True);
10653 return Py_True;
10654 }
10655 }
10656 if (left == right)
10657 result = 0;
10658 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010659 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010660
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010661 /* Convert the return value to a Boolean */
10662 switch (op) {
10663 case Py_EQ:
10664 v = TEST_COND(result == 0);
10665 break;
10666 case Py_NE:
10667 v = TEST_COND(result != 0);
10668 break;
10669 case Py_LE:
10670 v = TEST_COND(result <= 0);
10671 break;
10672 case Py_GE:
10673 v = TEST_COND(result >= 0);
10674 break;
10675 case Py_LT:
10676 v = TEST_COND(result == -1);
10677 break;
10678 case Py_GT:
10679 v = TEST_COND(result == 1);
10680 break;
10681 default:
10682 PyErr_BadArgument();
10683 return NULL;
10684 }
10685 Py_INCREF(v);
10686 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010687 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010688
Brian Curtindfc80e32011-08-10 20:28:54 -050010689 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010690}
10691
Alexander Belopolsky40018472011-02-26 01:02:56 +000010692int
10693PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010694{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010695 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 int kind1, kind2, kind;
10697 void *buf1, *buf2;
10698 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010699 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010700
10701 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010702 sub = PyUnicode_FromObject(element);
10703 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010704 PyErr_Format(PyExc_TypeError,
10705 "'in <string>' requires string as left operand, not %s",
10706 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010707 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010708 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010709 if (PyUnicode_READY(sub) == -1)
10710 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010711
Thomas Wouters477c8d52006-05-27 19:21:47 +000010712 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010713 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010714 Py_DECREF(sub);
10715 return -1;
10716 }
10717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010718 kind1 = PyUnicode_KIND(str);
10719 kind2 = PyUnicode_KIND(sub);
10720 kind = kind1 > kind2 ? kind1 : kind2;
10721 buf1 = PyUnicode_DATA(str);
10722 buf2 = PyUnicode_DATA(sub);
10723 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010724 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010725 if (!buf1) {
10726 Py_DECREF(sub);
10727 return -1;
10728 }
10729 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010730 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010731 if (!buf2) {
10732 Py_DECREF(sub);
10733 if (kind1 != kind) PyMem_Free(buf1);
10734 return -1;
10735 }
10736 len1 = PyUnicode_GET_LENGTH(str);
10737 len2 = PyUnicode_GET_LENGTH(sub);
10738
10739 switch(kind) {
10740 case PyUnicode_1BYTE_KIND:
10741 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10742 break;
10743 case PyUnicode_2BYTE_KIND:
10744 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10745 break;
10746 case PyUnicode_4BYTE_KIND:
10747 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10748 break;
10749 default:
10750 result = -1;
10751 assert(0);
10752 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010753
10754 Py_DECREF(str);
10755 Py_DECREF(sub);
10756
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 if (kind1 != kind)
10758 PyMem_Free(buf1);
10759 if (kind2 != kind)
10760 PyMem_Free(buf2);
10761
Guido van Rossum403d68b2000-03-13 15:55:09 +000010762 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010763}
10764
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765/* Concat to string or Unicode object giving a new Unicode object. */
10766
Alexander Belopolsky40018472011-02-26 01:02:56 +000010767PyObject *
10768PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010770 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010771 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010772 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010773
10774 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010777 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010778 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010780 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781
10782 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010783 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010784 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010785 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010787 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010788 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010789 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010790 }
10791
Victor Stinner488fa492011-12-12 00:01:39 +010010792 u_len = PyUnicode_GET_LENGTH(u);
10793 v_len = PyUnicode_GET_LENGTH(v);
10794 if (u_len > PY_SSIZE_T_MAX - v_len) {
10795 PyErr_SetString(PyExc_OverflowError,
10796 "strings are too large to concat");
10797 goto onError;
10798 }
10799 new_len = u_len + v_len;
10800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010802 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10803 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010804
Guido van Rossumd57fd912000-03-10 22:53:23 +000010805 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010806 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010807 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010808 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010010809 copy_characters(w, 0, u, 0, u_len);
10810 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010811 Py_DECREF(u);
10812 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010813 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010815
Benjamin Peterson29060642009-01-31 22:14:21 +000010816 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010817 Py_XDECREF(u);
10818 Py_XDECREF(v);
10819 return NULL;
10820}
10821
Walter Dörwald1ab83302007-05-18 17:15:44 +000010822void
Victor Stinner23e56682011-10-03 03:54:37 +020010823PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010824{
Victor Stinner23e56682011-10-03 03:54:37 +020010825 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010826 Py_UCS4 maxchar, maxchar2;
10827 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010828
10829 if (p_left == NULL) {
10830 if (!PyErr_Occurred())
10831 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010832 return;
10833 }
Victor Stinner23e56682011-10-03 03:54:37 +020010834 left = *p_left;
10835 if (right == NULL || !PyUnicode_Check(left)) {
10836 if (!PyErr_Occurred())
10837 PyErr_BadInternalCall();
10838 goto error;
10839 }
10840
Victor Stinnere1335c72011-10-04 20:53:03 +020010841 if (PyUnicode_READY(left))
10842 goto error;
10843 if (PyUnicode_READY(right))
10844 goto error;
10845
Victor Stinner488fa492011-12-12 00:01:39 +010010846 /* Shortcuts */
10847 if (left == unicode_empty) {
10848 Py_DECREF(left);
10849 Py_INCREF(right);
10850 *p_left = right;
10851 return;
10852 }
10853 if (right == unicode_empty)
10854 return;
10855
10856 left_len = PyUnicode_GET_LENGTH(left);
10857 right_len = PyUnicode_GET_LENGTH(right);
10858 if (left_len > PY_SSIZE_T_MAX - right_len) {
10859 PyErr_SetString(PyExc_OverflowError,
10860 "strings are too large to concat");
10861 goto error;
10862 }
10863 new_len = left_len + right_len;
10864
10865 if (unicode_modifiable(left)
10866 && PyUnicode_CheckExact(right)
10867 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010868 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10869 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010870 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010871 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010872 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10873 {
10874 /* append inplace */
10875 if (unicode_resize(p_left, new_len) != 0) {
10876 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10877 * deallocated so it cannot be put back into
10878 * 'variable'. The MemoryError is raised when there
10879 * is no value in 'variable', which might (very
10880 * remotely) be a cause of incompatibilities.
10881 */
10882 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010883 }
Victor Stinner488fa492011-12-12 00:01:39 +010010884 /* copy 'right' into the newly allocated area of 'left' */
10885 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010886 }
Victor Stinner488fa492011-12-12 00:01:39 +010010887 else {
10888 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10889 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10890 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010891
Victor Stinner488fa492011-12-12 00:01:39 +010010892 /* Concat the two Unicode strings */
10893 res = PyUnicode_New(new_len, maxchar);
10894 if (res == NULL)
10895 goto error;
10896 copy_characters(res, 0, left, 0, left_len);
10897 copy_characters(res, left_len, right, 0, right_len);
10898 Py_DECREF(left);
10899 *p_left = res;
10900 }
10901 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010902 return;
10903
10904error:
Victor Stinner488fa492011-12-12 00:01:39 +010010905 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010906}
10907
10908void
10909PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10910{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010911 PyUnicode_Append(pleft, right);
10912 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010913}
10914
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010915PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010916 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010918Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010919string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010920interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010921
10922static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010923unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010925 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010926 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010927 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010929 int kind1, kind2, kind;
10930 void *buf1, *buf2;
10931 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010932
Jesus Ceaac451502011-04-20 17:09:23 +020010933 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10934 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010935 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010936
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937 kind1 = PyUnicode_KIND(self);
10938 kind2 = PyUnicode_KIND(substring);
10939 kind = kind1 > kind2 ? kind1 : kind2;
10940 buf1 = PyUnicode_DATA(self);
10941 buf2 = PyUnicode_DATA(substring);
10942 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010943 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944 if (!buf1) {
10945 Py_DECREF(substring);
10946 return NULL;
10947 }
10948 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010949 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010950 if (!buf2) {
10951 Py_DECREF(substring);
10952 if (kind1 != kind) PyMem_Free(buf1);
10953 return NULL;
10954 }
10955 len1 = PyUnicode_GET_LENGTH(self);
10956 len2 = PyUnicode_GET_LENGTH(substring);
10957
10958 ADJUST_INDICES(start, end, len1);
10959 switch(kind) {
10960 case PyUnicode_1BYTE_KIND:
10961 iresult = ucs1lib_count(
10962 ((Py_UCS1*)buf1) + start, end - start,
10963 buf2, len2, PY_SSIZE_T_MAX
10964 );
10965 break;
10966 case PyUnicode_2BYTE_KIND:
10967 iresult = ucs2lib_count(
10968 ((Py_UCS2*)buf1) + start, end - start,
10969 buf2, len2, PY_SSIZE_T_MAX
10970 );
10971 break;
10972 case PyUnicode_4BYTE_KIND:
10973 iresult = ucs4lib_count(
10974 ((Py_UCS4*)buf1) + start, end - start,
10975 buf2, len2, PY_SSIZE_T_MAX
10976 );
10977 break;
10978 default:
10979 assert(0); iresult = 0;
10980 }
10981
10982 result = PyLong_FromSsize_t(iresult);
10983
10984 if (kind1 != kind)
10985 PyMem_Free(buf1);
10986 if (kind2 != kind)
10987 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988
10989 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010990
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991 return result;
10992}
10993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010994PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010995 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010997Encode S using the codec registered for encoding. Default encoding\n\
10998is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010999handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011000a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11001'xmlcharrefreplace' as well as any other name registered with\n\
11002codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003
11004static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011005unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011007 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008 char *encoding = NULL;
11009 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011010
Benjamin Peterson308d6372009-09-18 21:42:35 +000011011 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11012 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011014 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011015}
11016
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011017PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011018 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019\n\
11020Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011021If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011022
11023static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011024unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011025{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011026 Py_ssize_t i, j, line_pos, src_len, incr;
11027 Py_UCS4 ch;
11028 PyObject *u;
11029 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011030 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011031 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011032 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033
11034 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011035 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036
Antoine Pitrou22425222011-10-04 19:10:51 +020011037 if (PyUnicode_READY(self) == -1)
11038 return NULL;
11039
Thomas Wouters7e474022000-07-16 12:04:32 +000011040 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011041 src_len = PyUnicode_GET_LENGTH(self);
11042 i = j = line_pos = 0;
11043 kind = PyUnicode_KIND(self);
11044 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011045 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011046 for (; i < src_len; i++) {
11047 ch = PyUnicode_READ(kind, src_data, i);
11048 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011049 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011050 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011051 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011052 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011053 goto overflow;
11054 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011055 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011056 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011057 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011059 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011060 goto overflow;
11061 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011063 if (ch == '\n' || ch == '\r')
11064 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011066 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011067 if (!found)
11068 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011069
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011071 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072 if (!u)
11073 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011074 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075
Antoine Pitroue71d5742011-10-04 15:55:09 +020011076 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077
Antoine Pitroue71d5742011-10-04 15:55:09 +020011078 for (; i < src_len; i++) {
11079 ch = PyUnicode_READ(kind, src_data, i);
11080 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011081 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011082 incr = tabsize - (line_pos % tabsize);
11083 line_pos += incr;
11084 while (incr--) {
11085 PyUnicode_WRITE(kind, dest_data, j, ' ');
11086 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011087 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011088 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011089 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011090 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011091 line_pos++;
11092 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011093 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011094 if (ch == '\n' || ch == '\r')
11095 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011096 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011097 }
11098 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011099 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011100
Antoine Pitroue71d5742011-10-04 15:55:09 +020011101 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011102 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11103 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011104}
11105
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011106PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011107 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011108\n\
11109Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011110such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011111arguments start and end are interpreted as in slice notation.\n\
11112\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011113Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011114
11115static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011116unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011117{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011118 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011119 Py_ssize_t start;
11120 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011121 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122
Jesus Ceaac451502011-04-20 17:09:23 +020011123 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11124 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011125 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011126
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011127 if (PyUnicode_READY(self) == -1)
11128 return NULL;
11129 if (PyUnicode_READY(substring) == -1)
11130 return NULL;
11131
Victor Stinner7931d9a2011-11-04 00:22:48 +010011132 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011133
11134 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011136 if (result == -2)
11137 return NULL;
11138
Christian Heimes217cfd12007-12-02 14:31:20 +000011139 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140}
11141
11142static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011143unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011144{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011145 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11146 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011147 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149}
11150
Guido van Rossumc2504932007-09-18 19:42:40 +000011151/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011152 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011153static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011154unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011155{
Guido van Rossumc2504932007-09-18 19:42:40 +000011156 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011157 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159 if (_PyUnicode_HASH(self) != -1)
11160 return _PyUnicode_HASH(self);
11161 if (PyUnicode_READY(self) == -1)
11162 return -1;
11163 len = PyUnicode_GET_LENGTH(self);
11164
11165 /* The hash function as a macro, gets expanded three times below. */
11166#define HASH(P) \
11167 x = (Py_uhash_t)*P << 7; \
11168 while (--len >= 0) \
11169 x = (1000003*x) ^ (Py_uhash_t)*P++;
11170
11171 switch (PyUnicode_KIND(self)) {
11172 case PyUnicode_1BYTE_KIND: {
11173 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11174 HASH(c);
11175 break;
11176 }
11177 case PyUnicode_2BYTE_KIND: {
11178 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11179 HASH(s);
11180 break;
11181 }
11182 default: {
11183 Py_UCS4 *l;
11184 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11185 "Impossible switch case in unicode_hash");
11186 l = PyUnicode_4BYTE_DATA(self);
11187 HASH(l);
11188 break;
11189 }
11190 }
11191 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11192
Guido van Rossumc2504932007-09-18 19:42:40 +000011193 if (x == -1)
11194 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011195 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011196 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011198#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011200PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011201 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011203Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204
11205static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011206unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011208 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011209 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011210 Py_ssize_t start;
11211 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212
Jesus Ceaac451502011-04-20 17:09:23 +020011213 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11214 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011217 if (PyUnicode_READY(self) == -1)
11218 return NULL;
11219 if (PyUnicode_READY(substring) == -1)
11220 return NULL;
11221
Victor Stinner7931d9a2011-11-04 00:22:48 +010011222 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223
11224 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011226 if (result == -2)
11227 return NULL;
11228
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229 if (result < 0) {
11230 PyErr_SetString(PyExc_ValueError, "substring not found");
11231 return NULL;
11232 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011233
Christian Heimes217cfd12007-12-02 14:31:20 +000011234 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235}
11236
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011237PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011238 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011240Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011241at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242
11243static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011244unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011246 Py_ssize_t i, length;
11247 int kind;
11248 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249 int cased;
11250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011251 if (PyUnicode_READY(self) == -1)
11252 return NULL;
11253 length = PyUnicode_GET_LENGTH(self);
11254 kind = PyUnicode_KIND(self);
11255 data = PyUnicode_DATA(self);
11256
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011258 if (length == 1)
11259 return PyBool_FromLong(
11260 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011262 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011263 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011264 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011265
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011267 for (i = 0; i < length; i++) {
11268 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011269
Benjamin Peterson29060642009-01-31 22:14:21 +000011270 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11271 return PyBool_FromLong(0);
11272 else if (!cased && Py_UNICODE_ISLOWER(ch))
11273 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011275 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276}
11277
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011278PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011279 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011281Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011282at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283
11284static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011285unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011287 Py_ssize_t i, length;
11288 int kind;
11289 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011290 int cased;
11291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011292 if (PyUnicode_READY(self) == -1)
11293 return NULL;
11294 length = PyUnicode_GET_LENGTH(self);
11295 kind = PyUnicode_KIND(self);
11296 data = PyUnicode_DATA(self);
11297
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011299 if (length == 1)
11300 return PyBool_FromLong(
11301 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011303 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011304 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011305 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011306
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011308 for (i = 0; i < length; i++) {
11309 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011310
Benjamin Peterson29060642009-01-31 22:14:21 +000011311 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11312 return PyBool_FromLong(0);
11313 else if (!cased && Py_UNICODE_ISUPPER(ch))
11314 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011316 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317}
11318
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011319PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011320 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011322Return True if S is a titlecased string and there is at least one\n\
11323character in S, i.e. upper- and titlecase characters may only\n\
11324follow uncased characters and lowercase characters only cased ones.\n\
11325Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326
11327static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011328unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011330 Py_ssize_t i, length;
11331 int kind;
11332 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011333 int cased, previous_is_cased;
11334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011335 if (PyUnicode_READY(self) == -1)
11336 return NULL;
11337 length = PyUnicode_GET_LENGTH(self);
11338 kind = PyUnicode_KIND(self);
11339 data = PyUnicode_DATA(self);
11340
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 if (length == 1) {
11343 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11344 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11345 (Py_UNICODE_ISUPPER(ch) != 0));
11346 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011348 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011349 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011350 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011351
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352 cased = 0;
11353 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011354 for (i = 0; i < length; i++) {
11355 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011356
Benjamin Peterson29060642009-01-31 22:14:21 +000011357 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11358 if (previous_is_cased)
11359 return PyBool_FromLong(0);
11360 previous_is_cased = 1;
11361 cased = 1;
11362 }
11363 else if (Py_UNICODE_ISLOWER(ch)) {
11364 if (!previous_is_cased)
11365 return PyBool_FromLong(0);
11366 previous_is_cased = 1;
11367 cased = 1;
11368 }
11369 else
11370 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011372 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373}
11374
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011375PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011376 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011377\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011378Return True if all characters in S are whitespace\n\
11379and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380
11381static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011382unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011383{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384 Py_ssize_t i, length;
11385 int kind;
11386 void *data;
11387
11388 if (PyUnicode_READY(self) == -1)
11389 return NULL;
11390 length = PyUnicode_GET_LENGTH(self);
11391 kind = PyUnicode_KIND(self);
11392 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393
Guido van Rossumd57fd912000-03-10 22:53:23 +000011394 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011395 if (length == 1)
11396 return PyBool_FromLong(
11397 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011399 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011401 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011403 for (i = 0; i < length; i++) {
11404 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011405 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011406 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011408 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409}
11410
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011411PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011412 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011413\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011414Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011415and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011416
11417static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011418unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011419{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011420 Py_ssize_t i, length;
11421 int kind;
11422 void *data;
11423
11424 if (PyUnicode_READY(self) == -1)
11425 return NULL;
11426 length = PyUnicode_GET_LENGTH(self);
11427 kind = PyUnicode_KIND(self);
11428 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011429
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011430 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431 if (length == 1)
11432 return PyBool_FromLong(
11433 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011434
11435 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011436 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011437 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439 for (i = 0; i < length; i++) {
11440 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011441 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011442 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011443 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011444}
11445
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011446PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011447 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011448\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011449Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011450and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011451
11452static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011453unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011454{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 int kind;
11456 void *data;
11457 Py_ssize_t len, i;
11458
11459 if (PyUnicode_READY(self) == -1)
11460 return NULL;
11461
11462 kind = PyUnicode_KIND(self);
11463 data = PyUnicode_DATA(self);
11464 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011465
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011466 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 if (len == 1) {
11468 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11469 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11470 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011471
11472 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011473 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011474 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011476 for (i = 0; i < len; i++) {
11477 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011478 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011479 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011480 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011481 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011482}
11483
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011484PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011485 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011487Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011488False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489
11490static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011491unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011493 Py_ssize_t i, length;
11494 int kind;
11495 void *data;
11496
11497 if (PyUnicode_READY(self) == -1)
11498 return NULL;
11499 length = PyUnicode_GET_LENGTH(self);
11500 kind = PyUnicode_KIND(self);
11501 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 if (length == 1)
11505 return PyBool_FromLong(
11506 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011508 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011509 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011510 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011512 for (i = 0; i < length; i++) {
11513 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011514 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011516 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517}
11518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011519PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011520 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011522Return True if all characters in S are digits\n\
11523and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524
11525static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011526unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528 Py_ssize_t i, length;
11529 int kind;
11530 void *data;
11531
11532 if (PyUnicode_READY(self) == -1)
11533 return NULL;
11534 length = PyUnicode_GET_LENGTH(self);
11535 kind = PyUnicode_KIND(self);
11536 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011539 if (length == 1) {
11540 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11541 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11542 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011544 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011545 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011546 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011548 for (i = 0; i < length; i++) {
11549 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011550 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011552 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553}
11554
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011555PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011556 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011558Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011559False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560
11561static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011562unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011564 Py_ssize_t i, length;
11565 int kind;
11566 void *data;
11567
11568 if (PyUnicode_READY(self) == -1)
11569 return NULL;
11570 length = PyUnicode_GET_LENGTH(self);
11571 kind = PyUnicode_KIND(self);
11572 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011575 if (length == 1)
11576 return PyBool_FromLong(
11577 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011579 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011580 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011582
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011583 for (i = 0; i < length; i++) {
11584 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011585 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011587 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588}
11589
Martin v. Löwis47383402007-08-15 07:32:56 +000011590int
11591PyUnicode_IsIdentifier(PyObject *self)
11592{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011593 int kind;
11594 void *data;
11595 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011596 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011597
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011598 if (PyUnicode_READY(self) == -1) {
11599 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011600 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601 }
11602
11603 /* Special case for empty strings */
11604 if (PyUnicode_GET_LENGTH(self) == 0)
11605 return 0;
11606 kind = PyUnicode_KIND(self);
11607 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011608
11609 /* PEP 3131 says that the first character must be in
11610 XID_Start and subsequent characters in XID_Continue,
11611 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011612 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011613 letters, digits, underscore). However, given the current
11614 definition of XID_Start and XID_Continue, it is sufficient
11615 to check just for these, except that _ must be allowed
11616 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011617 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011618 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011619 return 0;
11620
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011621 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011622 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011623 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011624 return 1;
11625}
11626
11627PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011628 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011629\n\
11630Return True if S is a valid identifier according\n\
11631to the language definition.");
11632
11633static PyObject*
11634unicode_isidentifier(PyObject *self)
11635{
11636 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11637}
11638
Georg Brandl559e5d72008-06-11 18:37:52 +000011639PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011640 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011641\n\
11642Return True if all characters in S are considered\n\
11643printable in repr() or S is empty, False otherwise.");
11644
11645static PyObject*
11646unicode_isprintable(PyObject *self)
11647{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011648 Py_ssize_t i, length;
11649 int kind;
11650 void *data;
11651
11652 if (PyUnicode_READY(self) == -1)
11653 return NULL;
11654 length = PyUnicode_GET_LENGTH(self);
11655 kind = PyUnicode_KIND(self);
11656 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011657
11658 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011659 if (length == 1)
11660 return PyBool_FromLong(
11661 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011663 for (i = 0; i < length; i++) {
11664 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011665 Py_RETURN_FALSE;
11666 }
11667 }
11668 Py_RETURN_TRUE;
11669}
11670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011671PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011672 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011673\n\
11674Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011675iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676
11677static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011678unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011680 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681}
11682
Martin v. Löwis18e16552006-02-15 17:27:45 +000011683static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011684unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011686 if (PyUnicode_READY(self) == -1)
11687 return -1;
11688 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689}
11690
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011691PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011692 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011694Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011695done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011696
11697static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011698unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011700 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701 Py_UCS4 fillchar = ' ';
11702
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011703 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704 return NULL;
11705
Victor Stinnerc4b49542011-12-11 22:44:26 +010011706 if (PyUnicode_READY(self) < 0)
11707 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708
Victor Stinnerc4b49542011-12-11 22:44:26 +010011709 if (PyUnicode_GET_LENGTH(self) >= width)
11710 return unicode_result_unchanged(self);
11711
11712 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713}
11714
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011715PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011716 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011718Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719
11720static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011721unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723 return fixup(self, fixlower);
11724}
11725
/* Which end(s) of the string a strip operation works on.  The values are
   also used as indexes into stripformat[] below. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* PyArg_ParseTuple() format strings, indexed by strip type. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip type i: the format string without its
   leading "|O:". */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11734
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011735/* externally visible for str.strip(unicode) */
11736PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011737_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011738{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739 void *data;
11740 int kind;
11741 Py_ssize_t i, j, len;
11742 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011744 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11745 return NULL;
11746
11747 kind = PyUnicode_KIND(self);
11748 data = PyUnicode_DATA(self);
11749 len = PyUnicode_GET_LENGTH(self);
11750 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11751 PyUnicode_DATA(sepobj),
11752 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011753
Benjamin Peterson14339b62009-01-31 16:36:08 +000011754 i = 0;
11755 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 while (i < len &&
11757 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011758 i++;
11759 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011760 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011761
Benjamin Peterson14339b62009-01-31 16:36:08 +000011762 j = len;
11763 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011764 do {
11765 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766 } while (j >= i &&
11767 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011768 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011769 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011770
Victor Stinner7931d9a2011-11-04 00:22:48 +010011771 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772}
11773
11774PyObject*
11775PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11776{
11777 unsigned char *data;
11778 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011779 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011780
Victor Stinnerde636f32011-10-01 03:55:54 +020011781 if (PyUnicode_READY(self) == -1)
11782 return NULL;
11783
11784 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11785
Victor Stinner12bab6d2011-10-01 01:53:49 +020011786 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Victor Stinnerc4b49542011-12-11 22:44:26 +010011787 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788
Victor Stinner12bab6d2011-10-01 01:53:49 +020011789 length = end - start;
11790 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011791 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792
Victor Stinnerde636f32011-10-01 03:55:54 +020011793 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011794 PyErr_SetString(PyExc_IndexError, "string index out of range");
11795 return NULL;
11796 }
11797
Victor Stinnerb9275c12011-10-05 14:01:42 +020011798 if (PyUnicode_IS_ASCII(self)) {
11799 kind = PyUnicode_KIND(self);
11800 data = PyUnicode_1BYTE_DATA(self);
11801 return unicode_fromascii(data + start, length);
11802 }
11803 else {
11804 kind = PyUnicode_KIND(self);
11805 data = PyUnicode_1BYTE_DATA(self);
11806 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011807 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011808 length);
11809 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811
11812static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011813do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 int kind;
11816 void *data;
11817 Py_ssize_t len, i, j;
11818
11819 if (PyUnicode_READY(self) == -1)
11820 return NULL;
11821
11822 kind = PyUnicode_KIND(self);
11823 data = PyUnicode_DATA(self);
11824 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011825
Benjamin Peterson14339b62009-01-31 16:36:08 +000011826 i = 0;
11827 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011828 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011829 i++;
11830 }
11831 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011832
Benjamin Peterson14339b62009-01-31 16:36:08 +000011833 j = len;
11834 if (striptype != LEFTSTRIP) {
11835 do {
11836 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011838 j++;
11839 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011840
Victor Stinner7931d9a2011-11-04 00:22:48 +010011841 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842}
11843
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011844
11845static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011846do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011847{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011848 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011849
Benjamin Peterson14339b62009-01-31 16:36:08 +000011850 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11851 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011852
Benjamin Peterson14339b62009-01-31 16:36:08 +000011853 if (sep != NULL && sep != Py_None) {
11854 if (PyUnicode_Check(sep))
11855 return _PyUnicode_XStrip(self, striptype, sep);
11856 else {
11857 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011858 "%s arg must be None or str",
11859 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011860 return NULL;
11861 }
11862 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011863
Benjamin Peterson14339b62009-01-31 16:36:08 +000011864 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011865}
11866
11867
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011868PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011869 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011870\n\
11871Return a copy of the string S with leading and trailing\n\
11872whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011873If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011874
11875static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011876unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011877{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011878 if (PyTuple_GET_SIZE(args) == 0)
11879 return do_strip(self, BOTHSTRIP); /* Common case */
11880 else
11881 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011882}
11883
11884
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011885PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011886 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011887\n\
11888Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011889If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011890
11891static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011892unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011893{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011894 if (PyTuple_GET_SIZE(args) == 0)
11895 return do_strip(self, LEFTSTRIP); /* Common case */
11896 else
11897 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011898}
11899
11900
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011901PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011902 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011903\n\
11904Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011905If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011906
11907static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011908unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011909{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011910 if (PyTuple_GET_SIZE(args) == 0)
11911 return do_strip(self, RIGHTSTRIP); /* Common case */
11912 else
11913 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011914}
11915
11916
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011918unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011920 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922
Georg Brandl222de0f2009-04-12 12:01:50 +000011923 if (len < 1) {
11924 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011925 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011926 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927
Victor Stinnerc4b49542011-12-11 22:44:26 +010011928 /* no repeat, return original string */
11929 if (len == 1)
11930 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011931
Victor Stinnerc4b49542011-12-11 22:44:26 +010011932 if (PyUnicode_READY(str) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 return NULL;
11934
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011935 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011936 PyErr_SetString(PyExc_OverflowError,
11937 "repeated string is too long");
11938 return NULL;
11939 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011941
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011942 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943 if (!u)
11944 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011945 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011947 if (PyUnicode_GET_LENGTH(str) == 1) {
11948 const int kind = PyUnicode_KIND(str);
11949 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11950 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011951 if (kind == PyUnicode_1BYTE_KIND)
11952 memset(to, (unsigned char)fill_char, len);
11953 else {
11954 for (n = 0; n < len; ++n)
11955 PyUnicode_WRITE(kind, to, n, fill_char);
11956 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 }
11958 else {
11959 /* number of characters copied this far */
11960 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011961 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011962 char *to = (char *) PyUnicode_DATA(u);
11963 Py_MEMCPY(to, PyUnicode_DATA(str),
11964 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011965 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 n = (done <= nchars-done) ? done : nchars-done;
11967 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011968 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011969 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970 }
11971
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011972 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011973 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974}
11975
Alexander Belopolsky40018472011-02-26 01:02:56 +000011976PyObject *
11977PyUnicode_Replace(PyObject *obj,
11978 PyObject *subobj,
11979 PyObject *replobj,
11980 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981{
11982 PyObject *self;
11983 PyObject *str1;
11984 PyObject *str2;
11985 PyObject *result;
11986
11987 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011988 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011991 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011992 Py_DECREF(self);
11993 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994 }
11995 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011996 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011997 Py_DECREF(self);
11998 Py_DECREF(str1);
11999 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002 Py_DECREF(self);
12003 Py_DECREF(str1);
12004 Py_DECREF(str2);
12005 return result;
12006}
12007
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012008PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012009 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012010\n\
12011Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012012old replaced by new. If the optional argument count is\n\
12013given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012014
12015static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012017{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018 PyObject *str1;
12019 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012020 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012021 PyObject *result;
12022
Martin v. Löwis18e16552006-02-15 17:27:45 +000012023 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012025 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012026 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012027 str1 = PyUnicode_FromObject(str1);
12028 if (str1 == NULL || PyUnicode_READY(str1) == -1)
12029 return NULL;
12030 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020012031 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012032 Py_DECREF(str1);
12033 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012034 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012035
12036 result = replace(self, str1, str2, maxcount);
12037
12038 Py_DECREF(str1);
12039 Py_DECREF(str2);
12040 return result;
12041}
12042
Alexander Belopolsky40018472011-02-26 01:02:56 +000012043static PyObject *
12044unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012046 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012047 Py_ssize_t isize;
12048 Py_ssize_t osize, squote, dquote, i, o;
12049 Py_UCS4 max, quote;
12050 int ikind, okind;
12051 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012053 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012054 return NULL;
12055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 isize = PyUnicode_GET_LENGTH(unicode);
12057 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012059 /* Compute length of output, quote characters, and
12060 maximum character */
12061 osize = 2; /* quotes */
12062 max = 127;
12063 squote = dquote = 0;
12064 ikind = PyUnicode_KIND(unicode);
12065 for (i = 0; i < isize; i++) {
12066 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12067 switch (ch) {
12068 case '\'': squote++; osize++; break;
12069 case '"': dquote++; osize++; break;
12070 case '\\': case '\t': case '\r': case '\n':
12071 osize += 2; break;
12072 default:
12073 /* Fast-path ASCII */
12074 if (ch < ' ' || ch == 0x7f)
12075 osize += 4; /* \xHH */
12076 else if (ch < 0x7f)
12077 osize++;
12078 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12079 osize++;
12080 max = ch > max ? ch : max;
12081 }
12082 else if (ch < 0x100)
12083 osize += 4; /* \xHH */
12084 else if (ch < 0x10000)
12085 osize += 6; /* \uHHHH */
12086 else
12087 osize += 10; /* \uHHHHHHHH */
12088 }
12089 }
12090
12091 quote = '\'';
12092 if (squote) {
12093 if (dquote)
12094 /* Both squote and dquote present. Use squote,
12095 and escape them */
12096 osize += squote;
12097 else
12098 quote = '"';
12099 }
12100
12101 repr = PyUnicode_New(osize, max);
12102 if (repr == NULL)
12103 return NULL;
12104 okind = PyUnicode_KIND(repr);
12105 odata = PyUnicode_DATA(repr);
12106
12107 PyUnicode_WRITE(okind, odata, 0, quote);
12108 PyUnicode_WRITE(okind, odata, osize-1, quote);
12109
12110 for (i = 0, o = 1; i < isize; i++) {
12111 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012112
12113 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114 if ((ch == quote) || (ch == '\\')) {
12115 PyUnicode_WRITE(okind, odata, o++, '\\');
12116 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012117 continue;
12118 }
12119
Benjamin Peterson29060642009-01-31 22:14:21 +000012120 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012121 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012122 PyUnicode_WRITE(okind, odata, o++, '\\');
12123 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012124 }
12125 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012126 PyUnicode_WRITE(okind, odata, o++, '\\');
12127 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012128 }
12129 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012130 PyUnicode_WRITE(okind, odata, o++, '\\');
12131 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012132 }
12133
12134 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012135 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136 PyUnicode_WRITE(okind, odata, o++, '\\');
12137 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012138 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12139 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012140 }
12141
Georg Brandl559e5d72008-06-11 18:37:52 +000012142 /* Copy ASCII characters as-is */
12143 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012144 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012145 }
12146
Benjamin Peterson29060642009-01-31 22:14:21 +000012147 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012148 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012149 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012150 (categories Z* and C* except ASCII space)
12151 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012152 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012153 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012154 if (ch <= 0xff) {
12155 PyUnicode_WRITE(okind, odata, o++, '\\');
12156 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012157 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12158 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012159 }
12160 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161 else if (ch >= 0x10000) {
12162 PyUnicode_WRITE(okind, odata, o++, '\\');
12163 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012164 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12165 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12166 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12167 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12168 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12169 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12170 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12171 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012172 }
12173 /* Map 16-bit characters to '\uxxxx' */
12174 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012175 PyUnicode_WRITE(okind, odata, o++, '\\');
12176 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012177 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12178 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12179 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12180 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012181 }
12182 }
12183 /* Copy characters as-is */
12184 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012185 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012186 }
12187 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012188 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012189 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012190 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012191 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012192}
12193
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012194PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012195 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012196\n\
12197Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012198such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012199arguments start and end are interpreted as in slice notation.\n\
12200\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012201Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012202
12203static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012204unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012205{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012206 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012207 Py_ssize_t start;
12208 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012209 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210
Jesus Ceaac451502011-04-20 17:09:23 +020012211 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12212 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012213 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215 if (PyUnicode_READY(self) == -1)
12216 return NULL;
12217 if (PyUnicode_READY(substring) == -1)
12218 return NULL;
12219
Victor Stinner7931d9a2011-11-04 00:22:48 +010012220 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221
12222 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012224 if (result == -2)
12225 return NULL;
12226
Christian Heimes217cfd12007-12-02 14:31:20 +000012227 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012228}
12229
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012230PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012231 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012233Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234
12235static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012236unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012238 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012239 Py_ssize_t start;
12240 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012241 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242
Jesus Ceaac451502011-04-20 17:09:23 +020012243 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12244 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012245 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012247 if (PyUnicode_READY(self) == -1)
12248 return NULL;
12249 if (PyUnicode_READY(substring) == -1)
12250 return NULL;
12251
Victor Stinner7931d9a2011-11-04 00:22:48 +010012252 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012253
12254 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012256 if (result == -2)
12257 return NULL;
12258
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259 if (result < 0) {
12260 PyErr_SetString(PyExc_ValueError, "substring not found");
12261 return NULL;
12262 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012263
Christian Heimes217cfd12007-12-02 14:31:20 +000012264 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265}
12266
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012267PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012268 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012269\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012270Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012271done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272
12273static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012274unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012276 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012277 Py_UCS4 fillchar = ' ';
12278
Victor Stinnere9a29352011-10-01 02:14:59 +020012279 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012280 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012281
Victor Stinnerc4b49542011-12-11 22:44:26 +010012282 if (PyUnicode_READY(self) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283 return NULL;
12284
Victor Stinnerc4b49542011-12-11 22:44:26 +010012285 if (PyUnicode_GET_LENGTH(self) >= width)
12286 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287
Victor Stinnerc4b49542011-12-11 22:44:26 +010012288 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289}
12290
Alexander Belopolsky40018472011-02-26 01:02:56 +000012291PyObject *
12292PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012293{
12294 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012295
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296 s = PyUnicode_FromObject(s);
12297 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012298 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012299 if (sep != NULL) {
12300 sep = PyUnicode_FromObject(sep);
12301 if (sep == NULL) {
12302 Py_DECREF(s);
12303 return NULL;
12304 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305 }
12306
Victor Stinner9310abb2011-10-05 00:59:23 +020012307 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308
12309 Py_DECREF(s);
12310 Py_XDECREF(sep);
12311 return result;
12312}
12313
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012314PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012315 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012316\n\
12317Return a list of the words in S, using sep as the\n\
12318delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012319splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012320whitespace string is a separator and empty strings are\n\
12321removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012322
12323static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012324unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012325{
12326 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012327 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012328
Martin v. Löwis18e16552006-02-15 17:27:45 +000012329 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012330 return NULL;
12331
12332 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012333 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012334 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012335 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012336 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012337 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012338}
12339
Thomas Wouters477c8d52006-05-27 19:21:47 +000012340PyObject *
12341PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12342{
12343 PyObject* str_obj;
12344 PyObject* sep_obj;
12345 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 int kind1, kind2, kind;
12347 void *buf1 = NULL, *buf2 = NULL;
12348 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012349
12350 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012351 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012352 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012353 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012355 Py_DECREF(str_obj);
12356 return NULL;
12357 }
12358
Victor Stinner14f8f022011-10-05 20:58:25 +020012359 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012360 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012361 kind = Py_MAX(kind1, kind2);
12362 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012364 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012365 if (!buf1)
12366 goto onError;
12367 buf2 = PyUnicode_DATA(sep_obj);
12368 if (kind2 != kind)
12369 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12370 if (!buf2)
12371 goto onError;
12372 len1 = PyUnicode_GET_LENGTH(str_obj);
12373 len2 = PyUnicode_GET_LENGTH(sep_obj);
12374
Victor Stinner14f8f022011-10-05 20:58:25 +020012375 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012377 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12378 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12379 else
12380 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012381 break;
12382 case PyUnicode_2BYTE_KIND:
12383 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12384 break;
12385 case PyUnicode_4BYTE_KIND:
12386 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12387 break;
12388 default:
12389 assert(0);
12390 out = 0;
12391 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012392
12393 Py_DECREF(sep_obj);
12394 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012395 if (kind1 != kind)
12396 PyMem_Free(buf1);
12397 if (kind2 != kind)
12398 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012399
12400 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012401 onError:
12402 Py_DECREF(sep_obj);
12403 Py_DECREF(str_obj);
12404 if (kind1 != kind && buf1)
12405 PyMem_Free(buf1);
12406 if (kind2 != kind && buf2)
12407 PyMem_Free(buf2);
12408 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012409}
12410
12411
12412PyObject *
12413PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12414{
12415 PyObject* str_obj;
12416 PyObject* sep_obj;
12417 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012418 int kind1, kind2, kind;
12419 void *buf1 = NULL, *buf2 = NULL;
12420 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012421
12422 str_obj = PyUnicode_FromObject(str_in);
12423 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012424 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012425 sep_obj = PyUnicode_FromObject(sep_in);
12426 if (!sep_obj) {
12427 Py_DECREF(str_obj);
12428 return NULL;
12429 }
12430
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012431 kind1 = PyUnicode_KIND(str_in);
12432 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012433 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012434 buf1 = PyUnicode_DATA(str_in);
12435 if (kind1 != kind)
12436 buf1 = _PyUnicode_AsKind(str_in, kind);
12437 if (!buf1)
12438 goto onError;
12439 buf2 = PyUnicode_DATA(sep_obj);
12440 if (kind2 != kind)
12441 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12442 if (!buf2)
12443 goto onError;
12444 len1 = PyUnicode_GET_LENGTH(str_obj);
12445 len2 = PyUnicode_GET_LENGTH(sep_obj);
12446
12447 switch(PyUnicode_KIND(str_in)) {
12448 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012449 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12450 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12451 else
12452 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012453 break;
12454 case PyUnicode_2BYTE_KIND:
12455 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12456 break;
12457 case PyUnicode_4BYTE_KIND:
12458 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12459 break;
12460 default:
12461 assert(0);
12462 out = 0;
12463 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012464
12465 Py_DECREF(sep_obj);
12466 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 if (kind1 != kind)
12468 PyMem_Free(buf1);
12469 if (kind2 != kind)
12470 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012471
12472 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473 onError:
12474 Py_DECREF(sep_obj);
12475 Py_DECREF(str_obj);
12476 if (kind1 != kind && buf1)
12477 PyMem_Free(buf1);
12478 if (kind2 != kind && buf2)
12479 PyMem_Free(buf2);
12480 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012481}
12482
12483PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012484 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012485\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012486Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012487the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012488found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012489
12490static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012491unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012492{
Victor Stinner9310abb2011-10-05 00:59:23 +020012493 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012494}
12495
12496PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012497 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012498\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012499Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012500the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012501separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012502
12503static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012504unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012505{
Victor Stinner9310abb2011-10-05 00:59:23 +020012506 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012507}
12508
Alexander Belopolsky40018472011-02-26 01:02:56 +000012509PyObject *
12510PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012511{
12512 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012513
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012514 s = PyUnicode_FromObject(s);
12515 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012516 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012517 if (sep != NULL) {
12518 sep = PyUnicode_FromObject(sep);
12519 if (sep == NULL) {
12520 Py_DECREF(s);
12521 return NULL;
12522 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012523 }
12524
Victor Stinner9310abb2011-10-05 00:59:23 +020012525 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012526
12527 Py_DECREF(s);
12528 Py_XDECREF(sep);
12529 return result;
12530}
12531
12532PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012533 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012534\n\
12535Return a list of the words in S, using sep as the\n\
12536delimiter string, starting at the end of the string and\n\
12537working to the front. If maxsplit is given, at most maxsplit\n\
12538splits are done. If sep is not specified, any whitespace string\n\
12539is a separator.");
12540
12541static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012542unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012543{
12544 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012545 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012546
Martin v. Löwis18e16552006-02-15 17:27:45 +000012547 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012548 return NULL;
12549
12550 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012551 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012552 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012553 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012554 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012555 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012556}
12557
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012558PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012559 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012560\n\
12561Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012562Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012563is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564
12565static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012566unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012567{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012568 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012569 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012571 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12572 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573 return NULL;
12574
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012575 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012576}
12577
12578static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012579PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012580{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012581 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012582}
12583
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012584PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012585 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012586\n\
12587Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012588and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012589
12590static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012591unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012592{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593 return fixup(self, fixswapcase);
12594}
12595
Georg Brandlceee0772007-11-27 23:48:05 +000012596PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012597 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012598\n\
12599Return a translation table usable for str.translate().\n\
12600If there is only one argument, it must be a dictionary mapping Unicode\n\
12601ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012602Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012603If there are two arguments, they must be strings of equal length, and\n\
12604in the resulting dictionary, each character in x will be mapped to the\n\
12605character at the same position in y. If there is a third argument, it\n\
12606must be a string, whose characters will be mapped to None in the result.");
12607
12608static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012609unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012610{
12611 PyObject *x, *y = NULL, *z = NULL;
12612 PyObject *new = NULL, *key, *value;
12613 Py_ssize_t i = 0;
12614 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012615
Georg Brandlceee0772007-11-27 23:48:05 +000012616 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12617 return NULL;
12618 new = PyDict_New();
12619 if (!new)
12620 return NULL;
12621 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012622 int x_kind, y_kind, z_kind;
12623 void *x_data, *y_data, *z_data;
12624
Georg Brandlceee0772007-11-27 23:48:05 +000012625 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012626 if (!PyUnicode_Check(x)) {
12627 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12628 "be a string if there is a second argument");
12629 goto err;
12630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012631 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012632 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12633 "arguments must have equal length");
12634 goto err;
12635 }
12636 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 x_kind = PyUnicode_KIND(x);
12638 y_kind = PyUnicode_KIND(y);
12639 x_data = PyUnicode_DATA(x);
12640 y_data = PyUnicode_DATA(y);
12641 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12642 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12643 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012644 if (!key || !value)
12645 goto err;
12646 res = PyDict_SetItem(new, key, value);
12647 Py_DECREF(key);
12648 Py_DECREF(value);
12649 if (res < 0)
12650 goto err;
12651 }
12652 /* create entries for deleting chars in z */
12653 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012654 z_kind = PyUnicode_KIND(z);
12655 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012656 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012658 if (!key)
12659 goto err;
12660 res = PyDict_SetItem(new, key, Py_None);
12661 Py_DECREF(key);
12662 if (res < 0)
12663 goto err;
12664 }
12665 }
12666 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 int kind;
12668 void *data;
12669
Georg Brandlceee0772007-11-27 23:48:05 +000012670 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012671 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012672 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12673 "to maketrans it must be a dict");
12674 goto err;
12675 }
12676 /* copy entries into the new dict, converting string keys to int keys */
12677 while (PyDict_Next(x, &i, &key, &value)) {
12678 if (PyUnicode_Check(key)) {
12679 /* convert string keys to integer keys */
12680 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012681 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012682 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12683 "table must be of length 1");
12684 goto err;
12685 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012686 kind = PyUnicode_KIND(key);
12687 data = PyUnicode_DATA(key);
12688 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012689 if (!newkey)
12690 goto err;
12691 res = PyDict_SetItem(new, newkey, value);
12692 Py_DECREF(newkey);
12693 if (res < 0)
12694 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012695 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012696 /* just keep integer keys */
12697 if (PyDict_SetItem(new, key, value) < 0)
12698 goto err;
12699 } else {
12700 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12701 "be strings or integers");
12702 goto err;
12703 }
12704 }
12705 }
12706 return new;
12707 err:
12708 Py_DECREF(new);
12709 return NULL;
12710}
12711
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012712PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012713 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714\n\
12715Return a copy of the string S, where all characters have been mapped\n\
12716through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012717Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012718Unmapped characters are left untouched. Characters mapped to None\n\
12719are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720
12721static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012722unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012723{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012725}
12726
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012727PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012728 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012729\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012730Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012731
12732static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012733unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012734{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012735 return fixup(self, fixupper);
12736}
12737
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012738PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012739 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012741Pad a numeric string S with zeros on the left, to fill a field\n\
12742of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012743
12744static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012745unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012746{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012747 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012748 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012749 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012750 int kind;
12751 void *data;
12752 Py_UCS4 chr;
12753
Martin v. Löwis18e16552006-02-15 17:27:45 +000012754 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012755 return NULL;
12756
Victor Stinnerc4b49542011-12-11 22:44:26 +010012757 if (PyUnicode_READY(self) < 0)
12758 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012759
Victor Stinnerc4b49542011-12-11 22:44:26 +010012760 if (PyUnicode_GET_LENGTH(self) >= width)
12761 return unicode_result_unchanged(self);
12762
12763 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012764
12765 u = pad(self, fill, 0, '0');
12766
Walter Dörwald068325e2002-04-15 13:36:47 +000012767 if (u == NULL)
12768 return NULL;
12769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012770 kind = PyUnicode_KIND(u);
12771 data = PyUnicode_DATA(u);
12772 chr = PyUnicode_READ(kind, data, fill);
12773
12774 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012775 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776 PyUnicode_WRITE(kind, data, 0, chr);
12777 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012778 }
12779
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012780 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012781 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012782}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012783
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
12791
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012792PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012793 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012794\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012795Return True if S starts with the specified prefix, False otherwise.\n\
12796With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012797With optional end, stop comparing S at that position.\n\
12798prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799
12800static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012801unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012802 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012803{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012804 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012805 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012806 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012807 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012808 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012809
Jesus Ceaac451502011-04-20 17:09:23 +020012810 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012811 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012812 if (PyTuple_Check(subobj)) {
12813 Py_ssize_t i;
12814 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012815 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012816 if (substring == NULL)
12817 return NULL;
12818 result = tailmatch(self, substring, start, end, -1);
12819 Py_DECREF(substring);
12820 if (result) {
12821 Py_RETURN_TRUE;
12822 }
12823 }
12824 /* nothing matched */
12825 Py_RETURN_FALSE;
12826 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012827 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012828 if (substring == NULL) {
12829 if (PyErr_ExceptionMatches(PyExc_TypeError))
12830 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12831 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012832 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012833 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012834 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012835 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012836 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837}
12838
12839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012840PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012841 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012842\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012843Return True if S ends with the specified suffix, False otherwise.\n\
12844With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012845With optional end, stop comparing S at that position.\n\
12846suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847
12848static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012849unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012850 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012851{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012852 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012853 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012854 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012855 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012856 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012857
Jesus Ceaac451502011-04-20 17:09:23 +020012858 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012859 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012860 if (PyTuple_Check(subobj)) {
12861 Py_ssize_t i;
12862 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012863 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012864 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012865 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012866 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012867 result = tailmatch(self, substring, start, end, +1);
12868 Py_DECREF(substring);
12869 if (result) {
12870 Py_RETURN_TRUE;
12871 }
12872 }
12873 Py_RETURN_FALSE;
12874 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012875 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012876 if (substring == NULL) {
12877 if (PyErr_ExceptionMatches(PyExc_TypeError))
12878 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12879 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012880 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012881 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012882 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012884 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885}
12886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012887#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012888
12889PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012890 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012891\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012892Return a formatted version of S, using substitutions from args and kwargs.\n\
12893The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012894
Eric Smith27bbca62010-11-04 17:06:58 +000012895PyDoc_STRVAR(format_map__doc__,
12896 "S.format_map(mapping) -> str\n\
12897\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012898Return a formatted version of S, using substitutions from mapping.\n\
12899The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012900
Eric Smith4a7d76d2008-05-30 18:10:19 +000012901static PyObject *
12902unicode__format__(PyObject* self, PyObject* args)
12903{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012904 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012905
12906 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12907 return NULL;
12908
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012909 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012910 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012911 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012912}
12913
Eric Smith8c663262007-08-25 02:26:07 +000012914PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012915 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012916\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012917Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012918
12919static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012920unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012921{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012922 Py_ssize_t size;
12923
12924 /* If it's a compact object, account for base structure +
12925 character data. */
12926 if (PyUnicode_IS_COMPACT_ASCII(v))
12927 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12928 else if (PyUnicode_IS_COMPACT(v))
12929 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012930 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012931 else {
12932 /* If it is a two-block object, account for base object, and
12933 for character block if present. */
12934 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012935 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012936 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012937 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012938 }
12939 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012940 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012941 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012942 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012943 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012944 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012945
12946 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012947}
12948
12949PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012950 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012951
12952static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012953unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012954{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010012955 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012956 if (!copy)
12957 return NULL;
12958 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012959}
12960
Guido van Rossumd57fd912000-03-10 22:53:23 +000012961static PyMethodDef unicode_methods[] = {
12962
12963 /* Order is according to common usage: often used methods should
12964 appear first, since lookup is done sequentially. */
12965
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012966 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012967 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12968 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012969 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012970 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12971 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12972 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12973 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12974 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12975 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12976 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012977 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012978 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12979 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12980 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012981 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012982 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12983 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12984 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012985 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012986 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012987 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012988 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012989 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12990 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12991 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12992 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12993 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12994 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12995 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12996 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12997 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12998 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12999 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13000 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13001 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13002 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013003 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013004 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013005 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013006 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013007 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013008 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013009 {"maketrans", (PyCFunction) unicode_maketrans,
13010 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013011 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013012#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013013 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013014#endif
13015
13016#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013017 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013018 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013019#endif
13020
Benjamin Peterson14339b62009-01-31 16:36:08 +000013021 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013022 {NULL, NULL}
13023};
13024
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013025static PyObject *
13026unicode_mod(PyObject *v, PyObject *w)
13027{
Brian Curtindfc80e32011-08-10 20:28:54 -050013028 if (!PyUnicode_Check(v))
13029 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013030 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013031}
13032
13033static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013034 0, /*nb_add*/
13035 0, /*nb_subtract*/
13036 0, /*nb_multiply*/
13037 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013038};
13039
Guido van Rossumd57fd912000-03-10 22:53:23 +000013040static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013041 (lenfunc) unicode_length, /* sq_length */
13042 PyUnicode_Concat, /* sq_concat */
13043 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13044 (ssizeargfunc) unicode_getitem, /* sq_item */
13045 0, /* sq_slice */
13046 0, /* sq_ass_item */
13047 0, /* sq_ass_slice */
13048 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013049};
13050
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013051static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013052unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013053{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013054 if (PyUnicode_READY(self) == -1)
13055 return NULL;
13056
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013057 if (PyIndex_Check(item)) {
13058 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013059 if (i == -1 && PyErr_Occurred())
13060 return NULL;
13061 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013062 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013063 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013064 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013065 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013066 PyObject *result;
13067 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013068 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013069 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013071 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013072 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013073 return NULL;
13074 }
13075
13076 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013077 Py_INCREF(unicode_empty);
13078 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013079 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013080 slicelength == PyUnicode_GET_LENGTH(self)) {
13081 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013082 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013083 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013084 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013085 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013086 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013087 src_kind = PyUnicode_KIND(self);
13088 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013089 if (!PyUnicode_IS_ASCII(self)) {
13090 kind_limit = kind_maxchar_limit(src_kind);
13091 max_char = 0;
13092 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13093 ch = PyUnicode_READ(src_kind, src_data, cur);
13094 if (ch > max_char) {
13095 max_char = ch;
13096 if (max_char >= kind_limit)
13097 break;
13098 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013099 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013100 }
Victor Stinner55c99112011-10-13 01:17:06 +020013101 else
13102 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013103 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013104 if (result == NULL)
13105 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013106 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013107 dest_data = PyUnicode_DATA(result);
13108
13109 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013110 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13111 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013112 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013113 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013114 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013115 } else {
13116 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13117 return NULL;
13118 }
13119}
13120
/* Mapping-protocol slot table for the str type: mp_length is served by
   unicode_length, mp_subscript by unicode_subscript (integer and slice
   indexing), and the item-assignment slot is left null. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,               /* mp_ass_subscript */
};
13126
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128/* Helpers for PyUnicode_Format() */
13129
13130static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013131getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013133 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013134 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013135 (*p_argidx)++;
13136 if (arglen < 0)
13137 return args;
13138 else
13139 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013140 }
13141 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013142 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013143 return NULL;
13144}
13145
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013146/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013147
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013148static PyObject *
13149formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013150{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013151 char *p;
13152 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013153 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013154
Guido van Rossumd57fd912000-03-10 22:53:23 +000013155 x = PyFloat_AsDouble(v);
13156 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013157 return NULL;
13158
Guido van Rossumd57fd912000-03-10 22:53:23 +000013159 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013160 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013161
Eric Smith0923d1d2009-04-16 20:16:10 +000013162 p = PyOS_double_to_string(x, type, prec,
13163 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013164 if (p == NULL)
13165 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013166 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013167 PyMem_Free(p);
13168 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013169}
13170
Tim Peters38fd5b62000-09-21 05:43:11 +000013171static PyObject*
13172formatlong(PyObject *val, int flags, int prec, int type)
13173{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013174 char *buf;
13175 int len;
13176 PyObject *str; /* temporary string object. */
13177 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013178
Benjamin Peterson14339b62009-01-31 16:36:08 +000013179 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13180 if (!str)
13181 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013182 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013183 Py_DECREF(str);
13184 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013185}
13186
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013187static Py_UCS4
13188formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013189{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013190 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013191 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013192 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013193 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013194 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013195 goto onError;
13196 }
13197 else {
13198 /* Integer input truncated to a character */
13199 long x;
13200 x = PyLong_AsLong(v);
13201 if (x == -1 && PyErr_Occurred())
13202 goto onError;
13203
Victor Stinner8faf8212011-12-08 22:14:11 +010013204 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013205 PyErr_SetString(PyExc_OverflowError,
13206 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013207 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013208 }
13209
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013210 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013211 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013212
Benjamin Peterson29060642009-01-31 22:14:21 +000013213 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013214 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013215 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013216 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013217}
13218
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013219static int
13220repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13221{
13222 int r;
13223 assert(count > 0);
13224 assert(PyUnicode_Check(obj));
13225 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013226 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013227 if (repeated == NULL)
13228 return -1;
13229 r = _PyAccu_Accumulate(acc, repeated);
13230 Py_DECREF(repeated);
13231 return r;
13232 }
13233 else {
13234 do {
13235 if (_PyAccu_Accumulate(acc, obj))
13236 return -1;
13237 } while (--count);
13238 return 0;
13239 }
13240}
13241
Alexander Belopolsky40018472011-02-26 01:02:56 +000013242PyObject *
13243PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013244{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013245 void *fmt;
13246 int fmtkind;
13247 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013248 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013249 int r;
13250 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013251 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013252 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013253 PyObject *temp = NULL;
13254 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013255 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013256 _PyAccu acc;
13257 static PyObject *plus, *minus, *blank, *zero, *percent;
13258
13259 if (!plus && !(plus = get_latin1_char('+')))
13260 return NULL;
13261 if (!minus && !(minus = get_latin1_char('-')))
13262 return NULL;
13263 if (!blank && !(blank = get_latin1_char(' ')))
13264 return NULL;
13265 if (!zero && !(zero = get_latin1_char('0')))
13266 return NULL;
13267 if (!percent && !(percent = get_latin1_char('%')))
13268 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013269
Guido van Rossumd57fd912000-03-10 22:53:23 +000013270 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013271 PyErr_BadInternalCall();
13272 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013273 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013274 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013275 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013276 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013277 if (_PyAccu_Init(&acc))
13278 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013279 fmt = PyUnicode_DATA(uformat);
13280 fmtkind = PyUnicode_KIND(uformat);
13281 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13282 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013283
Guido van Rossumd57fd912000-03-10 22:53:23 +000013284 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013285 arglen = PyTuple_Size(args);
13286 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013287 }
13288 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013289 arglen = -1;
13290 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013291 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013292 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013293 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013294 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013295
13296 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013297 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013298 PyObject *nonfmt;
13299 Py_ssize_t nonfmtpos;
13300 nonfmtpos = fmtpos++;
13301 while (fmtcnt >= 0 &&
13302 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13303 fmtpos++;
13304 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013305 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013306 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013307 if (nonfmt == NULL)
13308 goto onError;
13309 r = _PyAccu_Accumulate(&acc, nonfmt);
13310 Py_DECREF(nonfmt);
13311 if (r)
13312 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013313 }
13314 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013315 /* Got a format specifier */
13316 int flags = 0;
13317 Py_ssize_t width = -1;
13318 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013319 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013320 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013321 int isnumok;
13322 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013323 void *pbuf = NULL;
13324 Py_ssize_t pindex, len;
13325 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013326
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013327 fmtpos++;
13328 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13329 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013330 Py_ssize_t keylen;
13331 PyObject *key;
13332 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013333
Benjamin Peterson29060642009-01-31 22:14:21 +000013334 if (dict == NULL) {
13335 PyErr_SetString(PyExc_TypeError,
13336 "format requires a mapping");
13337 goto onError;
13338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013339 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013340 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013341 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013342 /* Skip over balanced parentheses */
13343 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013344 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013345 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013346 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013347 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013348 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013349 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013350 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013351 if (fmtcnt < 0 || pcount > 0) {
13352 PyErr_SetString(PyExc_ValueError,
13353 "incomplete format key");
13354 goto onError;
13355 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013356 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013357 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013358 if (key == NULL)
13359 goto onError;
13360 if (args_owned) {
13361 Py_DECREF(args);
13362 args_owned = 0;
13363 }
13364 args = PyObject_GetItem(dict, key);
13365 Py_DECREF(key);
13366 if (args == NULL) {
13367 goto onError;
13368 }
13369 args_owned = 1;
13370 arglen = -1;
13371 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013372 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013373 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013374 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013375 case '-': flags |= F_LJUST; continue;
13376 case '+': flags |= F_SIGN; continue;
13377 case ' ': flags |= F_BLANK; continue;
13378 case '#': flags |= F_ALT; continue;
13379 case '0': flags |= F_ZERO; continue;
13380 }
13381 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013382 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013383 if (c == '*') {
13384 v = getnextarg(args, arglen, &argidx);
13385 if (v == NULL)
13386 goto onError;
13387 if (!PyLong_Check(v)) {
13388 PyErr_SetString(PyExc_TypeError,
13389 "* wants int");
13390 goto onError;
13391 }
13392 width = PyLong_AsLong(v);
13393 if (width == -1 && PyErr_Occurred())
13394 goto onError;
13395 if (width < 0) {
13396 flags |= F_LJUST;
13397 width = -width;
13398 }
13399 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013400 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013401 }
13402 else if (c >= '0' && c <= '9') {
13403 width = c - '0';
13404 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013405 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013406 if (c < '0' || c > '9')
13407 break;
13408 if ((width*10) / 10 != width) {
13409 PyErr_SetString(PyExc_ValueError,
13410 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013411 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013412 }
13413 width = width*10 + (c - '0');
13414 }
13415 }
13416 if (c == '.') {
13417 prec = 0;
13418 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013419 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013420 if (c == '*') {
13421 v = getnextarg(args, arglen, &argidx);
13422 if (v == NULL)
13423 goto onError;
13424 if (!PyLong_Check(v)) {
13425 PyErr_SetString(PyExc_TypeError,
13426 "* wants int");
13427 goto onError;
13428 }
13429 prec = PyLong_AsLong(v);
13430 if (prec == -1 && PyErr_Occurred())
13431 goto onError;
13432 if (prec < 0)
13433 prec = 0;
13434 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013435 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013436 }
13437 else if (c >= '0' && c <= '9') {
13438 prec = c - '0';
13439 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013440 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013441 if (c < '0' || c > '9')
13442 break;
13443 if ((prec*10) / 10 != prec) {
13444 PyErr_SetString(PyExc_ValueError,
13445 "prec too big");
13446 goto onError;
13447 }
13448 prec = prec*10 + (c - '0');
13449 }
13450 }
13451 } /* prec */
13452 if (fmtcnt >= 0) {
13453 if (c == 'h' || c == 'l' || c == 'L') {
13454 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013455 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013456 }
13457 }
13458 if (fmtcnt < 0) {
13459 PyErr_SetString(PyExc_ValueError,
13460 "incomplete format");
13461 goto onError;
13462 }
13463 if (c != '%') {
13464 v = getnextarg(args, arglen, &argidx);
13465 if (v == NULL)
13466 goto onError;
13467 }
13468 sign = 0;
13469 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013470 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013471 switch (c) {
13472
13473 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013474 _PyAccu_Accumulate(&acc, percent);
13475 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013476
13477 case 's':
13478 case 'r':
13479 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013480 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013481 temp = v;
13482 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013483 }
13484 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013485 if (c == 's')
13486 temp = PyObject_Str(v);
13487 else if (c == 'r')
13488 temp = PyObject_Repr(v);
13489 else
13490 temp = PyObject_ASCII(v);
13491 if (temp == NULL)
13492 goto onError;
13493 if (PyUnicode_Check(temp))
13494 /* nothing to do */;
13495 else {
13496 Py_DECREF(temp);
13497 PyErr_SetString(PyExc_TypeError,
13498 "%s argument has non-string str()");
13499 goto onError;
13500 }
13501 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013502 if (PyUnicode_READY(temp) == -1) {
13503 Py_CLEAR(temp);
13504 goto onError;
13505 }
13506 pbuf = PyUnicode_DATA(temp);
13507 kind = PyUnicode_KIND(temp);
13508 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013509 if (prec >= 0 && len > prec)
13510 len = prec;
13511 break;
13512
13513 case 'i':
13514 case 'd':
13515 case 'u':
13516 case 'o':
13517 case 'x':
13518 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013519 isnumok = 0;
13520 if (PyNumber_Check(v)) {
13521 PyObject *iobj=NULL;
13522
13523 if (PyLong_Check(v)) {
13524 iobj = v;
13525 Py_INCREF(iobj);
13526 }
13527 else {
13528 iobj = PyNumber_Long(v);
13529 }
13530 if (iobj!=NULL) {
13531 if (PyLong_Check(iobj)) {
13532 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013533 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013534 Py_DECREF(iobj);
13535 if (!temp)
13536 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013537 if (PyUnicode_READY(temp) == -1) {
13538 Py_CLEAR(temp);
13539 goto onError;
13540 }
13541 pbuf = PyUnicode_DATA(temp);
13542 kind = PyUnicode_KIND(temp);
13543 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013544 sign = 1;
13545 }
13546 else {
13547 Py_DECREF(iobj);
13548 }
13549 }
13550 }
13551 if (!isnumok) {
13552 PyErr_Format(PyExc_TypeError,
13553 "%%%c format: a number is required, "
13554 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13555 goto onError;
13556 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013557 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013558 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013559 fillobj = zero;
13560 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013561 break;
13562
13563 case 'e':
13564 case 'E':
13565 case 'f':
13566 case 'F':
13567 case 'g':
13568 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013569 temp = formatfloat(v, flags, prec, c);
13570 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013571 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013572 if (PyUnicode_READY(temp) == -1) {
13573 Py_CLEAR(temp);
13574 goto onError;
13575 }
13576 pbuf = PyUnicode_DATA(temp);
13577 kind = PyUnicode_KIND(temp);
13578 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013579 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013580 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013581 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013582 fillobj = zero;
13583 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013584 break;
13585
13586 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013587 {
13588 Py_UCS4 ch = formatchar(v);
13589 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013590 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013591 temp = _PyUnicode_FromUCS4(&ch, 1);
13592 if (temp == NULL)
13593 goto onError;
13594 pbuf = PyUnicode_DATA(temp);
13595 kind = PyUnicode_KIND(temp);
13596 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013597 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013598 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013599
13600 default:
13601 PyErr_Format(PyExc_ValueError,
13602 "unsupported format character '%c' (0x%x) "
13603 "at index %zd",
13604 (31<=c && c<=126) ? (char)c : '?',
13605 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013606 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013607 goto onError;
13608 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013609 /* pbuf is initialized here. */
13610 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013611 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013612 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13613 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013614 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013615 pindex++;
13616 }
13617 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13618 signobj = plus;
13619 len--;
13620 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013621 }
13622 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013623 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013624 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013625 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013626 else
13627 sign = 0;
13628 }
13629 if (width < len)
13630 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013631 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013632 if (fill != ' ') {
13633 assert(signobj != NULL);
13634 if (_PyAccu_Accumulate(&acc, signobj))
13635 goto onError;
13636 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013637 if (width > len)
13638 width--;
13639 }
13640 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013641 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013642 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013643 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013644 second = get_latin1_char(
13645 PyUnicode_READ(kind, pbuf, pindex + 1));
13646 pindex += 2;
13647 if (second == NULL ||
13648 _PyAccu_Accumulate(&acc, zero) ||
13649 _PyAccu_Accumulate(&acc, second))
13650 goto onError;
13651 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013652 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013653 width -= 2;
13654 if (width < 0)
13655 width = 0;
13656 len -= 2;
13657 }
13658 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013659 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013660 if (repeat_accumulate(&acc, fillobj, width - len))
13661 goto onError;
13662 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013663 }
13664 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013665 if (sign) {
13666 assert(signobj != NULL);
13667 if (_PyAccu_Accumulate(&acc, signobj))
13668 goto onError;
13669 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013670 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013671 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13672 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013673 second = get_latin1_char(
13674 PyUnicode_READ(kind, pbuf, pindex + 1));
13675 pindex += 2;
13676 if (second == NULL ||
13677 _PyAccu_Accumulate(&acc, zero) ||
13678 _PyAccu_Accumulate(&acc, second))
13679 goto onError;
13680 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013681 }
13682 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013683 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013684 if (temp != NULL) {
13685 assert(pbuf == PyUnicode_DATA(temp));
13686 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013687 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013688 else {
13689 const char *p = (const char *) pbuf;
13690 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013691 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013692 v = PyUnicode_FromKindAndData(kind, p, len);
13693 }
13694 if (v == NULL)
13695 goto onError;
13696 r = _PyAccu_Accumulate(&acc, v);
13697 Py_DECREF(v);
13698 if (r)
13699 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013700 if (width > len && repeat_accumulate(&acc, blank, width - len))
13701 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013702 if (dict && (argidx < arglen) && c != '%') {
13703 PyErr_SetString(PyExc_TypeError,
13704 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013705 goto onError;
13706 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013707 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013708 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013709 } /* until end */
13710 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013711 PyErr_SetString(PyExc_TypeError,
13712 "not all arguments converted during string formatting");
13713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013714 }
13715
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013716 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013717 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013718 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013719 }
13720 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013721 Py_XDECREF(temp);
13722 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013723 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013724
Benjamin Peterson29060642009-01-31 22:14:21 +000013725 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013726 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013727 Py_XDECREF(temp);
13728 Py_XDECREF(second);
13729 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013730 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013731 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013732 }
13733 return NULL;
13734}
13735
Jeremy Hylton938ace62002-07-17 16:30:39 +000013736static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013737unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13738
Tim Peters6d6c1a32001-08-02 04:15:00 +000013739static PyObject *
13740unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13741{
Benjamin Peterson29060642009-01-31 22:14:21 +000013742 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013743 static char *kwlist[] = {"object", "encoding", "errors", 0};
13744 char *encoding = NULL;
13745 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013746
Benjamin Peterson14339b62009-01-31 16:36:08 +000013747 if (type != &PyUnicode_Type)
13748 return unicode_subtype_new(type, args, kwds);
13749 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013750 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013751 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013752 if (x == NULL) {
13753 Py_INCREF(unicode_empty);
13754 return unicode_empty;
13755 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013756 if (encoding == NULL && errors == NULL)
13757 return PyObject_Str(x);
13758 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013759 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013760}
13761
Guido van Rossume023fe02001-08-30 03:12:59 +000013762static PyObject *
13763unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13764{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013765 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013766 Py_ssize_t length, char_size;
13767 int share_wstr, share_utf8;
13768 unsigned int kind;
13769 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013770
Benjamin Peterson14339b62009-01-31 16:36:08 +000013771 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013772
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013773 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013774 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013775 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013776 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013777 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013778 return NULL;
13779
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013780 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013781 if (self == NULL) {
13782 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013783 return NULL;
13784 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013785 kind = PyUnicode_KIND(unicode);
13786 length = PyUnicode_GET_LENGTH(unicode);
13787
13788 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013789#ifdef Py_DEBUG
13790 _PyUnicode_HASH(self) = -1;
13791#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013792 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013793#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013794 _PyUnicode_STATE(self).interned = 0;
13795 _PyUnicode_STATE(self).kind = kind;
13796 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013797 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013798 _PyUnicode_STATE(self).ready = 1;
13799 _PyUnicode_WSTR(self) = NULL;
13800 _PyUnicode_UTF8_LENGTH(self) = 0;
13801 _PyUnicode_UTF8(self) = NULL;
13802 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013803 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013804
13805 share_utf8 = 0;
13806 share_wstr = 0;
13807 if (kind == PyUnicode_1BYTE_KIND) {
13808 char_size = 1;
13809 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13810 share_utf8 = 1;
13811 }
13812 else if (kind == PyUnicode_2BYTE_KIND) {
13813 char_size = 2;
13814 if (sizeof(wchar_t) == 2)
13815 share_wstr = 1;
13816 }
13817 else {
13818 assert(kind == PyUnicode_4BYTE_KIND);
13819 char_size = 4;
13820 if (sizeof(wchar_t) == 4)
13821 share_wstr = 1;
13822 }
13823
13824 /* Ensure we won't overflow the length. */
13825 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13826 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013827 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013828 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013829 data = PyObject_MALLOC((length + 1) * char_size);
13830 if (data == NULL) {
13831 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013832 goto onError;
13833 }
13834
Victor Stinnerc3c74152011-10-02 20:39:55 +020013835 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013836 if (share_utf8) {
13837 _PyUnicode_UTF8_LENGTH(self) = length;
13838 _PyUnicode_UTF8(self) = data;
13839 }
13840 if (share_wstr) {
13841 _PyUnicode_WSTR_LENGTH(self) = length;
13842 _PyUnicode_WSTR(self) = (wchar_t *)data;
13843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013844
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013845 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013846 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013847 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013848#ifdef Py_DEBUG
13849 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13850#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013851 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013852 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013853
13854onError:
13855 Py_DECREF(unicode);
13856 Py_DECREF(self);
13857 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013858}
13859
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013860PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013861 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013862\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013863Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013864encoding defaults to the current default string encoding.\n\
13865errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013866
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013867static PyObject *unicode_iter(PyObject *seq);
13868
Guido van Rossumd57fd912000-03-10 22:53:23 +000013869PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013870 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013871 "str", /* tp_name */
13872 sizeof(PyUnicodeObject), /* tp_size */
13873 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013874 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013875 (destructor)unicode_dealloc, /* tp_dealloc */
13876 0, /* tp_print */
13877 0, /* tp_getattr */
13878 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013879 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013880 unicode_repr, /* tp_repr */
13881 &unicode_as_number, /* tp_as_number */
13882 &unicode_as_sequence, /* tp_as_sequence */
13883 &unicode_as_mapping, /* tp_as_mapping */
13884 (hashfunc) unicode_hash, /* tp_hash*/
13885 0, /* tp_call*/
13886 (reprfunc) unicode_str, /* tp_str */
13887 PyObject_GenericGetAttr, /* tp_getattro */
13888 0, /* tp_setattro */
13889 0, /* tp_as_buffer */
13890 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013891 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013892 unicode_doc, /* tp_doc */
13893 0, /* tp_traverse */
13894 0, /* tp_clear */
13895 PyUnicode_RichCompare, /* tp_richcompare */
13896 0, /* tp_weaklistoffset */
13897 unicode_iter, /* tp_iter */
13898 0, /* tp_iternext */
13899 unicode_methods, /* tp_methods */
13900 0, /* tp_members */
13901 0, /* tp_getset */
13902 &PyBaseObject_Type, /* tp_base */
13903 0, /* tp_dict */
13904 0, /* tp_descr_get */
13905 0, /* tp_descr_set */
13906 0, /* tp_dictoffset */
13907 0, /* tp_init */
13908 0, /* tp_alloc */
13909 unicode_new, /* tp_new */
13910 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013911};
13912
13913/* Initialize the Unicode implementation */
13914
Victor Stinner3a50e702011-10-18 21:21:00 +020013915int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013916{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013917 int i;
13918
Thomas Wouters477c8d52006-05-27 19:21:47 +000013919 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013920 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013921 0x000A, /* LINE FEED */
13922 0x000D, /* CARRIAGE RETURN */
13923 0x001C, /* FILE SEPARATOR */
13924 0x001D, /* GROUP SEPARATOR */
13925 0x001E, /* RECORD SEPARATOR */
13926 0x0085, /* NEXT LINE */
13927 0x2028, /* LINE SEPARATOR */
13928 0x2029, /* PARAGRAPH SEPARATOR */
13929 };
13930
Fred Drakee4315f52000-05-09 19:53:39 +000013931 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013932 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013933 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013934 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010013935 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013936
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013937 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013938 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013939 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013940 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013941
13942 /* initialize the linebreak bloom filter */
13943 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013944 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013945 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013946
13947 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013948
13949#ifdef HAVE_MBCS
13950 winver.dwOSVersionInfoSize = sizeof(winver);
13951 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13952 PyErr_SetFromWindowsErr(0);
13953 return -1;
13954 }
13955#endif
13956 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013957}
13958
13959/* Finalize the Unicode implementation */
13960
/* Releases nothing in this implementation: the function unconditionally
   reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13966
Guido van Rossumd57fd912000-03-10 22:53:23 +000013967void
Thomas Wouters78890102000-07-22 19:25:51 +000013968_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013969{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013970 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013971
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013972 Py_XDECREF(unicode_empty);
13973 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013974
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013975 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013976 if (unicode_latin1[i]) {
13977 Py_DECREF(unicode_latin1[i]);
13978 unicode_latin1[i] = NULL;
13979 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013980 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013981 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013982 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013983}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013984
Walter Dörwald16807132007-05-25 13:52:07 +000013985void
13986PyUnicode_InternInPlace(PyObject **p)
13987{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013988 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013989 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013990#ifdef Py_DEBUG
13991 assert(s != NULL);
13992 assert(_PyUnicode_CHECK(s));
13993#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013994 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013995 return;
13996#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013997 /* If it's a subclass, we don't really know what putting
13998 it in the interned dict might do. */
13999 if (!PyUnicode_CheckExact(s))
14000 return;
14001 if (PyUnicode_CHECK_INTERNED(s))
14002 return;
14003 if (interned == NULL) {
14004 interned = PyDict_New();
14005 if (interned == NULL) {
14006 PyErr_Clear(); /* Don't leave an exception */
14007 return;
14008 }
14009 }
14010 /* It might be that the GetItem call fails even
14011 though the key is present in the dictionary,
14012 namely when this happens during a stack overflow. */
14013 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014014 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014015 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014016
Benjamin Peterson29060642009-01-31 22:14:21 +000014017 if (t) {
14018 Py_INCREF(t);
14019 Py_DECREF(*p);
14020 *p = t;
14021 return;
14022 }
Walter Dörwald16807132007-05-25 13:52:07 +000014023
Benjamin Peterson14339b62009-01-31 16:36:08 +000014024 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014025 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014026 PyErr_Clear();
14027 PyThreadState_GET()->recursion_critical = 0;
14028 return;
14029 }
14030 PyThreadState_GET()->recursion_critical = 0;
14031 /* The two references in interned are not counted by refcnt.
14032 The deallocator will take care of this */
14033 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014034 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014035}
14036
14037void
14038PyUnicode_InternImmortal(PyObject **p)
14039{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014040 PyUnicode_InternInPlace(p);
14041 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014042 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014043 Py_INCREF(*p);
14044 }
Walter Dörwald16807132007-05-25 13:52:07 +000014045}
14046
14047PyObject *
14048PyUnicode_InternFromString(const char *cp)
14049{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014050 PyObject *s = PyUnicode_FromString(cp);
14051 if (s == NULL)
14052 return NULL;
14053 PyUnicode_InternInPlace(&s);
14054 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014055}
14056
Alexander Belopolsky40018472011-02-26 01:02:56 +000014057void
14058_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014059{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014060 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014061 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014062 Py_ssize_t i, n;
14063 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014064
Benjamin Peterson14339b62009-01-31 16:36:08 +000014065 if (interned == NULL || !PyDict_Check(interned))
14066 return;
14067 keys = PyDict_Keys(interned);
14068 if (keys == NULL || !PyList_Check(keys)) {
14069 PyErr_Clear();
14070 return;
14071 }
Walter Dörwald16807132007-05-25 13:52:07 +000014072
Benjamin Peterson14339b62009-01-31 16:36:08 +000014073 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14074 detector, interned unicode strings are not forcibly deallocated;
14075 rather, we give them their stolen references back, and then clear
14076 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014077
Benjamin Peterson14339b62009-01-31 16:36:08 +000014078 n = PyList_GET_SIZE(keys);
14079 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014080 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014081 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014082 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014083 if (PyUnicode_READY(s) == -1) {
14084 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014085 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014086 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014087 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014088 case SSTATE_NOT_INTERNED:
14089 /* XXX Shouldn't happen */
14090 break;
14091 case SSTATE_INTERNED_IMMORTAL:
14092 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014093 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014094 break;
14095 case SSTATE_INTERNED_MORTAL:
14096 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014097 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014098 break;
14099 default:
14100 Py_FatalError("Inconsistent interned string state.");
14101 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014102 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014103 }
14104 fprintf(stderr, "total size of all interned strings: "
14105 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14106 "mortal/immortal\n", mortal_size, immortal_size);
14107 Py_DECREF(keys);
14108 PyDict_Clear(interned);
14109 Py_DECREF(interned);
14110 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014111}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014112
14113
14114/********************* Unicode Iterator **************************/
14115
14116typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014117 PyObject_HEAD
14118 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014119 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014120} unicodeiterobject;
14121
14122static void
14123unicodeiter_dealloc(unicodeiterobject *it)
14124{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014125 _PyObject_GC_UNTRACK(it);
14126 Py_XDECREF(it->it_seq);
14127 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014128}
14129
14130static int
14131unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14132{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014133 Py_VISIT(it->it_seq);
14134 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014135}
14136
14137static PyObject *
14138unicodeiter_next(unicodeiterobject *it)
14139{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014140 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014141
Benjamin Peterson14339b62009-01-31 16:36:08 +000014142 assert(it != NULL);
14143 seq = it->it_seq;
14144 if (seq == NULL)
14145 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014146 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014148 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14149 int kind = PyUnicode_KIND(seq);
14150 void *data = PyUnicode_DATA(seq);
14151 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14152 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014153 if (item != NULL)
14154 ++it->it_index;
14155 return item;
14156 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014157
Benjamin Peterson14339b62009-01-31 16:36:08 +000014158 Py_DECREF(seq);
14159 it->it_seq = NULL;
14160 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014161}
14162
14163static PyObject *
14164unicodeiter_len(unicodeiterobject *it)
14165{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014166 Py_ssize_t len = 0;
14167 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014168 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014169 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014170}
14171
14172PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14173
14174static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014175 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014176 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014177 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014178};
14179
14180PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014181 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14182 "str_iterator", /* tp_name */
14183 sizeof(unicodeiterobject), /* tp_basicsize */
14184 0, /* tp_itemsize */
14185 /* methods */
14186 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14187 0, /* tp_print */
14188 0, /* tp_getattr */
14189 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014190 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014191 0, /* tp_repr */
14192 0, /* tp_as_number */
14193 0, /* tp_as_sequence */
14194 0, /* tp_as_mapping */
14195 0, /* tp_hash */
14196 0, /* tp_call */
14197 0, /* tp_str */
14198 PyObject_GenericGetAttr, /* tp_getattro */
14199 0, /* tp_setattro */
14200 0, /* tp_as_buffer */
14201 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14202 0, /* tp_doc */
14203 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14204 0, /* tp_clear */
14205 0, /* tp_richcompare */
14206 0, /* tp_weaklistoffset */
14207 PyObject_SelfIter, /* tp_iter */
14208 (iternextfunc)unicodeiter_next, /* tp_iternext */
14209 unicodeiter_methods, /* tp_methods */
14210 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014211};
14212
14213static PyObject *
14214unicode_iter(PyObject *seq)
14215{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014216 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014217
Benjamin Peterson14339b62009-01-31 16:36:08 +000014218 if (!PyUnicode_Check(seq)) {
14219 PyErr_BadInternalCall();
14220 return NULL;
14221 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014222 if (PyUnicode_READY(seq) == -1)
14223 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014224 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14225 if (it == NULL)
14226 return NULL;
14227 it->it_index = 0;
14228 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014229 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014230 _PyObject_GC_TRACK(it);
14231 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014232}
14233
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014234
14235size_t
14236Py_UNICODE_strlen(const Py_UNICODE *u)
14237{
14238 int res = 0;
14239 while(*u++)
14240 res++;
14241 return res;
14242}
14243
14244Py_UNICODE*
14245Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14246{
14247 Py_UNICODE *u = s1;
14248 while ((*u++ = *s2++));
14249 return s1;
14250}
14251
14252Py_UNICODE*
14253Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14254{
14255 Py_UNICODE *u = s1;
14256 while ((*u++ = *s2++))
14257 if (n-- == 0)
14258 break;
14259 return s1;
14260}
14261
14262Py_UNICODE*
14263Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14264{
14265 Py_UNICODE *u1 = s1;
14266 u1 += Py_UNICODE_strlen(u1);
14267 Py_UNICODE_strcpy(u1, s2);
14268 return s1;
14269}
14270
14271int
14272Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14273{
14274 while (*s1 && *s2 && *s1 == *s2)
14275 s1++, s2++;
14276 if (*s1 && *s2)
14277 return (*s1 < *s2) ? -1 : +1;
14278 if (*s1)
14279 return 1;
14280 if (*s2)
14281 return -1;
14282 return 0;
14283}
14284
14285int
14286Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14287{
14288 register Py_UNICODE u1, u2;
14289 for (; n != 0; n--) {
14290 u1 = *s1;
14291 u2 = *s2;
14292 if (u1 != u2)
14293 return (u1 < u2) ? -1 : +1;
14294 if (u1 == '\0')
14295 return 0;
14296 s1++;
14297 s2++;
14298 }
14299 return 0;
14300}
14301
14302Py_UNICODE*
14303Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14304{
14305 const Py_UNICODE *p;
14306 for (p = s; *p; p++)
14307 if (*p == c)
14308 return (Py_UNICODE*)p;
14309 return NULL;
14310}
14311
14312Py_UNICODE*
14313Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14314{
14315 const Py_UNICODE *p;
14316 p = s + Py_UNICODE_strlen(s);
14317 while (p != s) {
14318 p--;
14319 if (*p == c)
14320 return (Py_UNICODE*)p;
14321 }
14322 return NULL;
14323}
Victor Stinner331ea922010-08-10 16:37:20 +000014324
Victor Stinner71133ff2010-09-01 23:43:53 +000014325Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014326PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014327{
Victor Stinner577db2c2011-10-11 22:12:48 +020014328 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014329 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014330
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014331 if (!PyUnicode_Check(unicode)) {
14332 PyErr_BadArgument();
14333 return NULL;
14334 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014335 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014336 if (u == NULL)
14337 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014338 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014339 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014340 PyErr_NoMemory();
14341 return NULL;
14342 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014343 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014344 size *= sizeof(Py_UNICODE);
14345 copy = PyMem_Malloc(size);
14346 if (copy == NULL) {
14347 PyErr_NoMemory();
14348 return NULL;
14349 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014350 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014351 return copy;
14352}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014353
Georg Brandl66c221e2010-10-14 07:04:07 +000014354/* A _string module, to export formatter_parser and formatter_field_name_split
14355 to the string.Formatter class implemented in Python. */
14356
14357static PyMethodDef _string_methods[] = {
14358 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14359 METH_O, PyDoc_STR("split the argument as a field name")},
14360 {"formatter_parser", (PyCFunction) formatter_parser,
14361 METH_O, PyDoc_STR("parse the argument as a format string")},
14362 {NULL, NULL}
14363};
14364
14365static struct PyModuleDef _string_module = {
14366 PyModuleDef_HEAD_INIT,
14367 "_string",
14368 PyDoc_STR("string helper module"),
14369 0,
14370 _string_methods,
14371 NULL,
14372 NULL,
14373 NULL,
14374 NULL
14375};
14376
14377PyMODINIT_FUNC
14378PyInit__string(void)
14379{
14380 return PyModule_Create(&_string_module);
14381}
14382
14383
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014384#ifdef __cplusplus
14385}
14386#endif