blob: 9670ae84fc394101e0a4619b9a02819b62ef61e5 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Byte-order selection.  Exactly one of BYTEORDER_IS_BIG_ENDIAN and
   BYTEORDER_IS_LITTLE_ENDIAN gets defined; little endian is the default
   when WORDS_BIGENDIAN is not defined. */

#if !defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
56
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000057/* --- Globals ------------------------------------------------------------
58
59 The globals are initialized by the _PyUnicode_Init() API and should
60 not be used before calling that API.
61
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Largest Unicode code point: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10FFFF

/* Type check used by the assertions in this file.  Debug builds run the
   consistency checker (without scanning the content); other builds only
   test the object type. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* Unchecked field accessors.  Each one expands to a plain member access on
   the object viewed through the named struct type, so it can also be used
   as the target of an assignment. */
#define _PyUnicode_UTF8(op)         (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op)         (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject*)(op))->data.any)

/* Checked accessors: assert that the object is a (ready) Unicode object
   before reading.  For a compact ASCII string the UTF-8 data is the memory
   directly following the PyASCIIObject header and its length is the string
   length; otherwise the utf8/utf8_length fields are read. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     (PyUnicode_IS_COMPACT_ASCII(op)                    \
          ? ((char*)((PyASCIIObject*)(op) + 1))         \
          : _PyUnicode_UTF8(op)))
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     (PyUnicode_IS_COMPACT_ASCII(op)                    \
          ? ((PyASCIIObject*)(op))->length              \
          : _PyUnicode_UTF8_LENGTH(op)))
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* File-local override of PyUnicode_READY: evaluates to 0 when the string is
   already ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     (!PyUnicode_IS_READY(op) ? _PyUnicode_Ready(op) : 0))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the utf8 pointer of the object is the canonical data buffer
   itself.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object is the canonical data buffer
   itself.  The macro argument is used throughout: the previous expansion
   referenced a variable literally named `unicode`, which only worked by
   accident at call sites whose variable had that name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* True if the Unicode object owns a separately allocated UTF-8 block, i.e.
   it is not compact ASCII, its utf8 pointer is set, and that pointer is not
   the canonical data buffer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (!PyUnicode_IS_COMPACT_ASCII(op)                           \
      && _PyUnicode_UTF8(op) != NULL                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separately allocated wstr block, i.e.
   its wstr pointer is set and the object is either not ready yet or the
   pointer is not the canonical data buffer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op) != NULL                               \
      && !(PyUnicode_IS_READY(op)                               \
           && _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
143
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The copy is unrolled four characters at a time, followed by a plain loop
   for the remainder.  Every argument is evaluated exactly once.  The `to`
   argument is parenthesised before the cast so that an expression argument
   is converted as a whole rather than only its first operand. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)

/* The Unicode string has been modified: reset the cached hash to -1. */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
170
Walter Dörwald16807132007-05-25 13:52:07 +0000171/* This dictionary holds all interned unicode strings. Note that references
172 to strings in this dictionary are *not* counted in the string's ob_refcnt.
173 When the interned string reaches a refcnt of 0 the string deallocation
174 function will delete the reference from this dictionary.
175
176 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000177 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000178*/
179static PyObject *interned;
180
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200182static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000183
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200184/* List of static strings. */
185static _Py_Identifier *static_strings;
186
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187/* Single character Unicode strings in the Latin-1 range are being
188 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200189static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
221
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200223static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200224static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200225static void copy_characters(
226 PyObject *to, Py_ssize_t to_start,
227 PyObject *from, Py_ssize_t from_start,
228 Py_ssize_t how_many);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200231unicode_fromascii(const unsigned char *s, Py_ssize_t size);
232static PyObject *
233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
234static PyObject *
235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
236static PyObject *
237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
238
239static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100242 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100248 PyObject *unicode,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Fast detection of line breaks in the ASCII range: a 128-entry table
   indexed by ASCII code, 1 for a line-break character and 0 otherwise.
   The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
                  0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal state of a Unicode object.

   Asserts that the kind/compact/ascii/ready flags of `op` agree with each
   other and with the data, utf8 and wstr pointers and their lengths.  When
   `check_content` is non-zero and the string is not in the wchar_t-only
   form, the characters are scanned too, asserting that the largest one
   really requires the kind in use.

   Always returns 1, so the call itself can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    kind = ascii->state.kind;

    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *full = (PyUnicodeObject *)op;

            data = full->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact, ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
            else {
                /* non-compact, not ready: only the wstr buffer exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_sized_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    else {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *chars = PyUnicode_DATA(ascii);
        Py_UCS4 highest = 0;
        Py_ssize_t pos = 0;

        while (pos < ascii->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
            if (highest < ch)
                highest = ch;
            pos++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            else {
                assert(highest < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
            break;
        default:
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
            break;
        }
    }
    return 1;
}
#endif
405
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100406static PyObject*
407unicode_result_wchar(PyObject *unicode)
408{
409#ifndef Py_DEBUG
410 Py_ssize_t len;
411
412 assert(Py_REFCNT(unicode) == 1);
413
414 len = _PyUnicode_WSTR_LENGTH(unicode);
415 if (len == 0) {
416 Py_INCREF(unicode_empty);
417 Py_DECREF(unicode);
418 return unicode_empty;
419 }
420
421 if (len == 1) {
422 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
423 if (ch < 256) {
424 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
425 Py_DECREF(unicode);
426 return latin1_char;
427 }
428 }
429
430 if (_PyUnicode_Ready(unicode) < 0) {
431 Py_XDECREF(unicode);
432 return NULL;
433 }
434#else
435 /* don't make the result ready in debug mode to ensure that the caller
436 makes the string ready before using it */
437 assert(_PyUnicode_CheckConsistency(unicode, 1));
438#endif
439 return unicode;
440}
441
442static PyObject*
443unicode_result_ready(PyObject *unicode)
444{
445 Py_ssize_t length;
446
447 length = PyUnicode_GET_LENGTH(unicode);
448 if (length == 0) {
449 if (unicode != unicode_empty) {
450 Py_INCREF(unicode_empty);
451 Py_DECREF(unicode);
452 }
453 return unicode_empty;
454 }
455
456 if (length == 1) {
457 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
458 if (ch < 256) {
459 PyObject *latin1_char = unicode_latin1[ch];
460 if (latin1_char != NULL) {
461 if (unicode != latin1_char) {
462 Py_INCREF(latin1_char);
463 Py_DECREF(unicode);
464 }
465 return latin1_char;
466 }
467 else {
468 assert(_PyUnicode_CheckConsistency(unicode, 1));
469 Py_INCREF(unicode);
470 unicode_latin1[ch] = unicode;
471 return unicode;
472 }
473 }
474 }
475
476 assert(_PyUnicode_CheckConsistency(unicode, 1));
477 return unicode;
478}
479
480static PyObject*
481unicode_result(PyObject *unicode)
482{
483 assert(_PyUnicode_CHECK(unicode));
484 if (PyUnicode_IS_READY(unicode))
485 return unicode_result_ready(unicode);
486 else
487 return unicode_result_wchar(unicode);
488}
489
#if defined(HAVE_MBCS)
/* Only declared when HAVE_MBCS is defined. */
static OSVERSIONINFOEX winver;
#endif
493
Thomas Wouters477c8d52006-05-27 19:21:47 +0000494/* --- Bloom Filters ----------------------------------------------------- */
495
496/* stuff to implement simple "bloom filters" for Unicode characters.
497 to keep things simple, we use a single bitmask, using the least 5
498 bits from each unicode characters as the bit index. */
499
500/* the linebreak mask is set up by Unicode_Init below */
501
Antoine Pitrouf068f942010-01-13 14:19:12 +0000502#if LONG_BIT >= 128
503#define BLOOM_WIDTH 128
504#elif LONG_BIT >= 64
505#define BLOOM_WIDTH 64
506#elif LONG_BIT >= 32
507#define BLOOM_WIDTH 32
508#else
509#error "LONG_BIT is smaller than 32"
510#endif
511
/* Integer type used to hold a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK() for characters outside ASCII. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the bit selected by `ch` (its value
   modulo BLOOM_WIDTH) in `mask`.  `mask` must be an lvalue for BLOOM_ADD.
   Both arguments are parenthesised so that expression arguments such as
   `a | b` are handled as a whole. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* True if `ch` is a line break.  ASCII characters are looked up in
   ascii_linebreak[]; for the others the bloom mask is tested first, so
   Py_UNICODE_ISLINEBREAK() is only evaluated when the mask matches. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000522
Alexander Belopolsky40018472011-02-26 01:02:56 +0000523Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200524make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000525{
526 /* calculate simple bloom-style bitmask for a given unicode string */
527
Antoine Pitrouf068f942010-01-13 14:19:12 +0000528 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000529 Py_ssize_t i;
530
531 mask = 0;
532 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200533 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000534
535 return mask;
536}
537
/* True if `chr` occurs in the string `str`.  The bloom mask `mask` is
   tested first, so PyUnicode_FindChar() is only called when the mask
   matches. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    ((BLOOM(mask, chr) != 0)                                            \
     && !(PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) < 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000541
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200542/* Compilation of templated routines */
543
544#include "stringlib/asciilib.h"
545#include "stringlib/fastsearch.h"
546#include "stringlib/partition.h"
547#include "stringlib/split.h"
548#include "stringlib/count.h"
549#include "stringlib/find.h"
550#include "stringlib/find_max_char.h"
551#include "stringlib/localeutil.h"
552#include "stringlib/undef.h"
553
554#include "stringlib/ucs1lib.h"
555#include "stringlib/fastsearch.h"
556#include "stringlib/partition.h"
557#include "stringlib/split.h"
558#include "stringlib/count.h"
559#include "stringlib/find.h"
560#include "stringlib/find_max_char.h"
561#include "stringlib/localeutil.h"
562#include "stringlib/undef.h"
563
564#include "stringlib/ucs2lib.h"
565#include "stringlib/fastsearch.h"
566#include "stringlib/partition.h"
567#include "stringlib/split.h"
568#include "stringlib/count.h"
569#include "stringlib/find.h"
570#include "stringlib/find_max_char.h"
571#include "stringlib/localeutil.h"
572#include "stringlib/undef.h"
573
574#include "stringlib/ucs4lib.h"
575#include "stringlib/fastsearch.h"
576#include "stringlib/partition.h"
577#include "stringlib/split.h"
578#include "stringlib/count.h"
579#include "stringlib/find.h"
580#include "stringlib/find_max_char.h"
581#include "stringlib/localeutil.h"
582#include "stringlib/undef.h"
583
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200584#include "stringlib/unicodedefs.h"
585#include "stringlib/fastsearch.h"
586#include "stringlib/count.h"
587#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100588#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200589
Guido van Rossumd57fd912000-03-10 22:53:23 +0000590/* --- Unicode Object ----------------------------------------------------- */
591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200592static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200593fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200594
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200595Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
596 Py_ssize_t size, Py_UCS4 ch,
597 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200598{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200599 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
600
601 switch (kind) {
602 case PyUnicode_1BYTE_KIND:
603 {
604 Py_UCS1 ch1 = (Py_UCS1) ch;
605 if (ch1 == ch)
606 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
607 else
608 return -1;
609 }
610 case PyUnicode_2BYTE_KIND:
611 {
612 Py_UCS2 ch2 = (Py_UCS2) ch;
613 if (ch2 == ch)
614 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
615 else
616 return -1;
617 }
618 case PyUnicode_4BYTE_KIND:
619 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
620 default:
621 assert(0);
622 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200624}
625
Victor Stinnerfe226c02011-10-03 03:52:20 +0200626static PyObject*
627resize_compact(PyObject *unicode, Py_ssize_t length)
628{
629 Py_ssize_t char_size;
630 Py_ssize_t struct_size;
631 Py_ssize_t new_size;
632 int share_wstr;
633
634 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200635 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200636 if (PyUnicode_IS_COMPACT_ASCII(unicode))
637 struct_size = sizeof(PyASCIIObject);
638 else
639 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200640 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200641
642 _Py_DEC_REFTOTAL;
643 _Py_ForgetReference(unicode);
644
645 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
646 PyErr_NoMemory();
647 return NULL;
648 }
649 new_size = (struct_size + (length + 1) * char_size);
650
651 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
652 if (unicode == NULL) {
653 PyObject_Del(unicode);
654 PyErr_NoMemory();
655 return NULL;
656 }
657 _Py_NewReference(unicode);
658 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200659 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200660 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200661 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
662 _PyUnicode_WSTR_LENGTH(unicode) = length;
663 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200664 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
665 length, 0);
666 return unicode;
667}
668
Alexander Belopolsky40018472011-02-26 01:02:56 +0000669static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200670resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000671{
Victor Stinner95663112011-10-04 01:03:50 +0200672 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200673 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000675
Victor Stinner95663112011-10-04 01:03:50 +0200676 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200677
678 if (PyUnicode_IS_READY(unicode)) {
679 Py_ssize_t char_size;
680 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200681 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 void *data;
683
684 data = _PyUnicode_DATA_ANY(unicode);
685 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200686 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200687 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
688 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200689 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
690 {
691 PyObject_DEL(_PyUnicode_UTF8(unicode));
692 _PyUnicode_UTF8(unicode) = NULL;
693 _PyUnicode_UTF8_LENGTH(unicode) = 0;
694 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695
696 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
697 PyErr_NoMemory();
698 return -1;
699 }
700 new_size = (length + 1) * char_size;
701
702 data = (PyObject *)PyObject_REALLOC(data, new_size);
703 if (data == NULL) {
704 PyErr_NoMemory();
705 return -1;
706 }
707 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200708 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200710 _PyUnicode_WSTR_LENGTH(unicode) = length;
711 }
712 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200713 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200714 _PyUnicode_UTF8_LENGTH(unicode) = length;
715 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 _PyUnicode_LENGTH(unicode) = length;
717 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200718 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200719 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200720 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200722 }
Victor Stinner95663112011-10-04 01:03:50 +0200723 assert(_PyUnicode_WSTR(unicode) != NULL);
724
725 /* check for integer overflow */
726 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
727 PyErr_NoMemory();
728 return -1;
729 }
730 wstr = _PyUnicode_WSTR(unicode);
731 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
732 if (!wstr) {
733 PyErr_NoMemory();
734 return -1;
735 }
736 _PyUnicode_WSTR(unicode) = wstr;
737 _PyUnicode_WSTR(unicode)[length] = 0;
738 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200739 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740 return 0;
741}
742
Victor Stinnerfe226c02011-10-03 03:52:20 +0200743static PyObject*
744resize_copy(PyObject *unicode, Py_ssize_t length)
745{
746 Py_ssize_t copy_length;
747 if (PyUnicode_IS_COMPACT(unicode)) {
748 PyObject *copy;
749 assert(PyUnicode_IS_READY(unicode));
750
751 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
752 if (copy == NULL)
753 return NULL;
754
755 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200756 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200757 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200758 }
759 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200760 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200761 assert(_PyUnicode_WSTR(unicode) != NULL);
762 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200763 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 if (w == NULL)
765 return NULL;
766 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
767 copy_length = Py_MIN(copy_length, length);
768 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
769 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200770 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200771 }
772}
773
/* We allocate one more character than requested to make sure the string
   is Ux0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200783#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200784static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200785#endif
786
Alexander Belopolsky40018472011-02-26 01:02:56 +0000787static PyUnicodeObject *
788_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000789{
790 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200791 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000792
Thomas Wouters477c8d52006-05-27 19:21:47 +0000793 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000794 if (length == 0 && unicode_empty != NULL) {
795 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200796 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000797 }
798
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000799 /* Ensure we won't overflow the size. */
800 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
801 return (PyUnicodeObject *)PyErr_NoMemory();
802 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200803 if (length < 0) {
804 PyErr_SetString(PyExc_SystemError,
805 "Negative size passed to _PyUnicode_New");
806 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000807 }
808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809#ifdef Py_DEBUG
810 ++unicode_old_new_calls;
811#endif
812
813 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
814 if (unicode == NULL)
815 return NULL;
816 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
817 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
818 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000819 PyErr_NoMemory();
820 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000821 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200822
Jeremy Hyltond8082792003-09-16 19:41:39 +0000823 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000824 * the caller fails before initializing str -- unicode_resize()
825 * reads str[0], and the Keep-Alive optimization can keep memory
826 * allocated for str alive across a call to unicode_dealloc(unicode).
827 * We don't want unicode_resize to read uninitialized memory in
828 * that case.
829 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200830 _PyUnicode_WSTR(unicode)[0] = 0;
831 _PyUnicode_WSTR(unicode)[length] = 0;
832 _PyUnicode_WSTR_LENGTH(unicode) = length;
833 _PyUnicode_HASH(unicode) = -1;
834 _PyUnicode_STATE(unicode).interned = 0;
835 _PyUnicode_STATE(unicode).kind = 0;
836 _PyUnicode_STATE(unicode).compact = 0;
837 _PyUnicode_STATE(unicode).ready = 0;
838 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200839 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200840 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200841 _PyUnicode_UTF8(unicode) = NULL;
842 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100843 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000844 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000845
Benjamin Peterson29060642009-01-31 22:14:21 +0000846 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000847 /* XXX UNREF/NEWREF interface should be more symmetrical */
848 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000849 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000850 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000851 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000852}
853
Victor Stinnerf42dc442011-10-02 23:33:16 +0200854static const char*
855unicode_kind_name(PyObject *unicode)
856{
Victor Stinner42dfd712011-10-03 14:41:45 +0200857 /* don't check consistency: unicode_kind_name() is called from
858 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200859 if (!PyUnicode_IS_COMPACT(unicode))
860 {
861 if (!PyUnicode_IS_READY(unicode))
862 return "wstr";
863 switch(PyUnicode_KIND(unicode))
864 {
865 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200866 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200867 return "legacy ascii";
868 else
869 return "legacy latin1";
870 case PyUnicode_2BYTE_KIND:
871 return "legacy UCS2";
872 case PyUnicode_4BYTE_KIND:
873 return "legacy UCS4";
874 default:
875 return "<legacy invalid kind>";
876 }
877 }
878 assert(PyUnicode_IS_READY(unicode));
879 switch(PyUnicode_KIND(unicode))
880 {
881 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200882 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200883 return "ascii";
884 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200885 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200886 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200887 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200888 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200889 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200890 default:
891 return "<invalid compact kind>";
892 }
893}
894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200895#ifdef Py_DEBUG
/* Debug-build counter: number of PyUnicode_New() calls that got past the
   empty-string shortcut. */
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
902
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
906void *_PyUnicode_data(void *unicode){
907 printf("obj %p\n", unicode);
908 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
909 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
910 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
911 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
912 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
913 return PyUnicode_DATA(unicode);
914}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200915
916void
917_PyUnicode_Dump(PyObject *op)
918{
919 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200920 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
921 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
922 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200923
Victor Stinnera849a4b2011-10-03 12:12:11 +0200924 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200925 {
926 if (ascii->state.ascii)
927 data = (ascii + 1);
928 else
929 data = (compact + 1);
930 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200931 else
932 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200933 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
934
Victor Stinnera849a4b2011-10-03 12:12:11 +0200935 if (ascii->wstr == data)
936 printf("shared ");
937 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200938
Victor Stinnera3b334d2011-10-03 13:53:37 +0200939 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200940 printf(" (%zu), ", compact->wstr_length);
941 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
942 printf("shared ");
943 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200944 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200945 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200946}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200947#endif
948
949PyObject *
950PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
951{
952 PyObject *obj;
953 PyCompactUnicodeObject *unicode;
954 void *data;
955 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200956 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200957 Py_ssize_t char_size;
958 Py_ssize_t struct_size;
959
960 /* Optimization for empty strings */
961 if (size == 0 && unicode_empty != NULL) {
962 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200963 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200964 }
965
966#ifdef Py_DEBUG
967 ++unicode_new_new_calls;
968#endif
969
Victor Stinner9e9d6892011-10-04 01:02:02 +0200970 is_ascii = 0;
971 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972 struct_size = sizeof(PyCompactUnicodeObject);
973 if (maxchar < 128) {
974 kind_state = PyUnicode_1BYTE_KIND;
975 char_size = 1;
976 is_ascii = 1;
977 struct_size = sizeof(PyASCIIObject);
978 }
979 else if (maxchar < 256) {
980 kind_state = PyUnicode_1BYTE_KIND;
981 char_size = 1;
982 }
983 else if (maxchar < 65536) {
984 kind_state = PyUnicode_2BYTE_KIND;
985 char_size = 2;
986 if (sizeof(wchar_t) == 2)
987 is_sharing = 1;
988 }
989 else {
990 kind_state = PyUnicode_4BYTE_KIND;
991 char_size = 4;
992 if (sizeof(wchar_t) == 4)
993 is_sharing = 1;
994 }
995
996 /* Ensure we won't overflow the size. */
997 if (size < 0) {
998 PyErr_SetString(PyExc_SystemError,
999 "Negative size passed to PyUnicode_New");
1000 return NULL;
1001 }
1002 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1003 return PyErr_NoMemory();
1004
1005 /* Duplicated allocation code from _PyObject_New() instead of a call to
1006 * PyObject_New() so we are able to allocate space for the object and
1007 * it's data buffer.
1008 */
1009 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1010 if (obj == NULL)
1011 return PyErr_NoMemory();
1012 obj = PyObject_INIT(obj, &PyUnicode_Type);
1013 if (obj == NULL)
1014 return NULL;
1015
1016 unicode = (PyCompactUnicodeObject *)obj;
1017 if (is_ascii)
1018 data = ((PyASCIIObject*)obj) + 1;
1019 else
1020 data = unicode + 1;
1021 _PyUnicode_LENGTH(unicode) = size;
1022 _PyUnicode_HASH(unicode) = -1;
1023 _PyUnicode_STATE(unicode).interned = 0;
1024 _PyUnicode_STATE(unicode).kind = kind_state;
1025 _PyUnicode_STATE(unicode).compact = 1;
1026 _PyUnicode_STATE(unicode).ready = 1;
1027 _PyUnicode_STATE(unicode).ascii = is_ascii;
1028 if (is_ascii) {
1029 ((char*)data)[size] = 0;
1030 _PyUnicode_WSTR(unicode) = NULL;
1031 }
1032 else if (kind_state == PyUnicode_1BYTE_KIND) {
1033 ((char*)data)[size] = 0;
1034 _PyUnicode_WSTR(unicode) = NULL;
1035 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001036 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001037 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001038 }
1039 else {
1040 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001041 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001042 if (kind_state == PyUnicode_2BYTE_KIND)
1043 ((Py_UCS2*)data)[size] = 0;
1044 else /* kind_state == PyUnicode_4BYTE_KIND */
1045 ((Py_UCS4*)data)[size] = 0;
1046 if (is_sharing) {
1047 _PyUnicode_WSTR_LENGTH(unicode) = size;
1048 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1049 }
1050 else {
1051 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1052 _PyUnicode_WSTR(unicode) = NULL;
1053 }
1054 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001055 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 return obj;
1057}
1058
1059#if SIZEOF_WCHAR_T == 2
1060/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1061 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001062 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001063
1064 This function assumes that unicode can hold one more code point than wstr
1065 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001066static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001068 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069{
1070 const wchar_t *iter;
1071 Py_UCS4 *ucs4_out;
1072
Victor Stinner910337b2011-10-03 03:20:16 +02001073 assert(unicode != NULL);
1074 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001075 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1076 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1077
1078 for (iter = begin; iter < end; ) {
1079 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1080 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001081 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1082 && (iter+1) < end
1083 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001084 {
Victor Stinner551ac952011-11-29 22:58:13 +01001085 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001086 iter += 2;
1087 }
1088 else {
1089 *ucs4_out++ = *iter;
1090 iter++;
1091 }
1092 }
1093 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1094 _PyUnicode_GET_LENGTH(unicode)));
1095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001096}
1097#endif
1098
Victor Stinnercd9950f2011-10-02 00:34:53 +02001099static int
1100_PyUnicode_Dirty(PyObject *unicode)
1101{
Victor Stinner910337b2011-10-03 03:20:16 +02001102 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001103 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001104 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001105 "Cannot modify a string having more than 1 reference");
1106 return -1;
1107 }
1108 _PyUnicode_DIRTY(unicode);
1109 return 0;
1110}
1111
/* Copy how_many characters from `from` (starting at from_start) into `to`
   (starting at to_start), widening or narrowing the storage as needed.

   Both strings must be ready and the ranges must be valid; this is only
   verified with assertions.  No exception is set by this function.

   check_maxchar:
     0  the caller guarantees every copied character fits into `to`; this
        is verified with assertions in debug builds only.
     1  narrowing copies are checked character by character.  In non-debug
        builds 1 is returned as soon as a character exceeds the maximum
        character of `to` (characters before it have already been written);
        in debug builds the same condition triggers an assertion instead.

   Returns 0 on success, 1 on failure (see above, or the "inconsistent
   state" fallback at the end). */
static int
_copy_characters(PyObject *to, Py_ssize_t to_start,
                 PyObject *from, Py_ssize_t from_start,
                 Py_ssize_t how_many, int check_maxchar)
{
    unsigned int from_kind, to_kind;
    void *from_data, *to_data;
    int fast;

    assert(PyUnicode_Check(from));
    assert(PyUnicode_Check(to));
    assert(PyUnicode_IS_READY(from));
    assert(PyUnicode_IS_READY(to));

    assert(PyUnicode_GET_LENGTH(from) >= how_many);
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
    assert(0 <= how_many);

    if (how_many == 0)
        return 0;

    from_kind = PyUnicode_KIND(from);
    from_data = PyUnicode_DATA(from);
    to_kind = PyUnicode_KIND(to);
    to_data = PyUnicode_DATA(to);

#ifdef Py_DEBUG
    /* Unchecked mode: verify the caller's promise that a narrowing copy
       does not lose any character. */
    if (!check_maxchar
        && (from_kind > to_kind
            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
    {
        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
        Py_UCS4 ch;
        Py_ssize_t i;
        for (i=0; i < how_many; i++) {
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
            assert(ch <= to_maxchar);
        }
    }
#endif
    /* Same kind on both sides: a plain memory copy is possible... */
    fast = (from_kind == to_kind);
    if (check_maxchar
        && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
    {
        /* deny latin1 => ascii */
        fast = 0;
    }

    if (fast) {
        /* The kind value is used here as the per-character byte width. */
        Py_MEMCPY((char*)to_data + to_kind * to_start,
                  (char*)from_data + from_kind * from_start,
                  to_kind * how_many);
    }
    /* The next three branches are the widening conversions. */
    else if (from_kind == PyUnicode_1BYTE_KIND
             && to_kind == PyUnicode_2BYTE_KIND)
    {
        _PyUnicode_CONVERT_BYTES(
            Py_UCS1, Py_UCS2,
            PyUnicode_1BYTE_DATA(from) + from_start,
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
            PyUnicode_2BYTE_DATA(to) + to_start
            );
    }
    else if (from_kind == PyUnicode_1BYTE_KIND
             && to_kind == PyUnicode_4BYTE_KIND)
    {
        _PyUnicode_CONVERT_BYTES(
            Py_UCS1, Py_UCS4,
            PyUnicode_1BYTE_DATA(from) + from_start,
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
            PyUnicode_4BYTE_DATA(to) + to_start
            );
    }
    else if (from_kind == PyUnicode_2BYTE_KIND
             && to_kind == PyUnicode_4BYTE_KIND)
    {
        _PyUnicode_CONVERT_BYTES(
            Py_UCS2, Py_UCS4,
            PyUnicode_2BYTE_DATA(from) + from_start,
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
            PyUnicode_4BYTE_DATA(to) + to_start
            );
    }
    else {
        /* check if max_char(from substring) <= max_char(to) */
        if (from_kind > to_kind
                /* latin1 => ascii */
            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
        {
            /* slow path to check for character overflow */
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
            Py_UCS4 ch;
            Py_ssize_t i;

#ifdef Py_DEBUG
            /* Debug build: an oversized character is an assertion failure,
               whatever the value of check_maxchar. */
            for (i=0; i < how_many; i++) {
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
                assert(ch <= to_maxchar);
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
            }
#else
            if (!check_maxchar) {
                /* Unchecked narrowing copy. */
                for (i=0; i < how_many; i++) {
                    ch = PyUnicode_READ(from_kind, from_data, from_start + i);
                    PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
                }
            }
            else {
                /* Checked narrowing copy: stop at the first character that
                   does not fit. */
                for (i=0; i < how_many; i++) {
                    ch = PyUnicode_READ(from_kind, from_data, from_start + i);
                    if (ch > to_maxchar)
                        return 1;
                    PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
                }
            }
#endif
        }
        else {
            /* No branch above matched this kind combination. */
            assert(0 && "inconsistent state");
            return 1;
        }
    }
    return 0;
}
1236
1237static void
1238copy_characters(PyObject *to, Py_ssize_t to_start,
1239 PyObject *from, Py_ssize_t from_start,
1240 Py_ssize_t how_many)
1241{
1242 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1243}
1244
1245Py_ssize_t
1246PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1247 PyObject *from, Py_ssize_t from_start,
1248 Py_ssize_t how_many)
1249{
1250 int err;
1251
1252 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1253 PyErr_BadInternalCall();
1254 return -1;
1255 }
1256
1257 if (PyUnicode_READY(from))
1258 return -1;
1259 if (PyUnicode_READY(to))
1260 return -1;
1261
1262 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1263 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1264 PyErr_Format(PyExc_SystemError,
1265 "Cannot write %zi characters at %zi "
1266 "in a string of %zi characters",
1267 how_many, to_start, PyUnicode_GET_LENGTH(to));
1268 return -1;
1269 }
1270
1271 if (how_many == 0)
1272 return 0;
1273
1274 if (_PyUnicode_Dirty(to))
1275 return -1;
1276
1277 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1278 if (err) {
1279 PyErr_Format(PyExc_SystemError,
1280 "Cannot copy %s characters "
1281 "into a string of %s characters",
1282 unicode_kind_name(from),
1283 unicode_kind_name(to));
1284 return -1;
1285 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001286 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001287}
1288
Victor Stinner17222162011-09-28 22:15:37 +02001289/* Find the maximum code point and count the number of surrogate pairs so a
1290 correct string length can be computed before converting a string to UCS4.
1291 This function counts single surrogates as a character and not as a pair.
1292
1293 Return 0 on success, or -1 on error. */
1294static int
1295find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1296 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297{
1298 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001299 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001300
Victor Stinnerc53be962011-10-02 21:33:54 +02001301 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001302 *num_surrogates = 0;
1303 *maxchar = 0;
1304
1305 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001307 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1308 && (iter+1) < end
1309 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001311 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001313 iter += 2;
1314 }
1315 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001317 {
1318 ch = *iter;
1319 iter++;
1320 }
1321 if (ch > *maxchar) {
1322 *maxchar = ch;
1323 if (*maxchar > MAX_UNICODE) {
1324 PyErr_Format(PyExc_ValueError,
1325 "character U+%x is not in range [U+0000; U+10ffff]",
1326 ch);
1327 return -1;
1328 }
1329 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 }
1331 return 0;
1332}
1333
#ifdef Py_DEBUG
/* Debug-build counter: number of calls to _PyUnicode_Ready(). */
static int unicode_ready_calls = 0;
#endif

/* Convert a string that only has a wchar_t buffer into its canonical
   ("ready") form.

   The buffer is scanned for its maximum code point, the narrowest kind
   (1, 2 or 4 bytes per character) is chosen, and the data, length, kind,
   ascii and utf8 fields are filled in.  Depending on sizeof(wchar_t), the
   data either aliases the existing wstr buffer or is stored in a newly
   allocated buffer, in which case the wstr buffer is freed.

   Returns 0 on success.  Returns -1 if the scan fails (invalid code point)
   or if a buffer cannot be allocated (MemoryError). */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

#ifdef Py_DEBUG
    ++unicode_ready_calls;
#endif

    /* Determine the widest character and the number of surrogate pairs. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    if (maxchar < 256) {
        /* 1 byte per character: always needs a new, narrower buffer. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the utf8 pointer shares the data buffer. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        /* The wide buffer is no longer needed. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        /* Narrow into a new 2-byte buffer and drop the wide one. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair collapses into a single code point. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is not 2 bytes wide here: the data aliases the wstr
           buffer, no copy is made. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1466
Alexander Belopolsky40018472011-02-26 01:02:56 +00001467static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001468unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469{
Walter Dörwald16807132007-05-25 13:52:07 +00001470 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001471 case SSTATE_NOT_INTERNED:
1472 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001473
Benjamin Peterson29060642009-01-31 22:14:21 +00001474 case SSTATE_INTERNED_MORTAL:
1475 /* revive dead object temporarily for DelItem */
1476 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001477 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001478 Py_FatalError(
1479 "deletion of interned string failed");
1480 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001481
Benjamin Peterson29060642009-01-31 22:14:21 +00001482 case SSTATE_INTERNED_IMMORTAL:
1483 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001484
Benjamin Peterson29060642009-01-31 22:14:21 +00001485 default:
1486 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001487 }
1488
Victor Stinner03490912011-10-03 23:45:12 +02001489 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001491 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001492 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001493
1494 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001495 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496 }
1497 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001498 if (_PyUnicode_DATA_ANY(unicode))
1499 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001500 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501 }
1502}
1503
#ifdef Py_DEBUG
/* Debug-only helper: return 1 when `unicode` is the shared empty string or
   the cached one-character string stored in unicode_latin1[], 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string that is not in the wchar_t kind can be
       one of the cached Latin-1 characters. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1520
Alexander Belopolsky40018472011-02-26 01:02:56 +00001521static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001522unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001523{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001524 if (Py_REFCNT(unicode) != 1)
1525 return 0;
1526 if (PyUnicode_CHECK_INTERNED(unicode))
1527 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001528#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001529 /* singleton refcount is greater than 1 */
1530 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001531#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001532 return 1;
1533}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001534
Victor Stinnerfe226c02011-10-03 03:52:20 +02001535static int
1536unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1537{
1538 PyObject *unicode;
1539 Py_ssize_t old_length;
1540
1541 assert(p_unicode != NULL);
1542 unicode = *p_unicode;
1543
1544 assert(unicode != NULL);
1545 assert(PyUnicode_Check(unicode));
1546 assert(0 <= length);
1547
Victor Stinner910337b2011-10-03 03:20:16 +02001548 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001549 old_length = PyUnicode_WSTR_LENGTH(unicode);
1550 else
1551 old_length = PyUnicode_GET_LENGTH(unicode);
1552 if (old_length == length)
1553 return 0;
1554
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001555 if (length == 0) {
1556 Py_DECREF(*p_unicode);
1557 *p_unicode = unicode_empty;
1558 Py_INCREF(*p_unicode);
1559 return 0;
1560 }
1561
Victor Stinnerfe226c02011-10-03 03:52:20 +02001562 if (!unicode_resizable(unicode)) {
1563 PyObject *copy = resize_copy(unicode, length);
1564 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001565 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001566 Py_DECREF(*p_unicode);
1567 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001568 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001569 }
1570
Victor Stinnerfe226c02011-10-03 03:52:20 +02001571 if (PyUnicode_IS_COMPACT(unicode)) {
1572 *p_unicode = resize_compact(unicode, length);
1573 if (*p_unicode == NULL)
1574 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001575 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001576 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001577 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001578 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001579}
1580
Alexander Belopolsky40018472011-02-26 01:02:56 +00001581int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001582PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001583{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001584 PyObject *unicode;
1585 if (p_unicode == NULL) {
1586 PyErr_BadInternalCall();
1587 return -1;
1588 }
1589 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001590 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001591 {
1592 PyErr_BadInternalCall();
1593 return -1;
1594 }
1595 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001596}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001597
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001598static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001599unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001600{
1601 PyObject *result;
1602 assert(PyUnicode_IS_READY(*p_unicode));
1603 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1604 return 0;
1605 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1606 maxchar);
1607 if (result == NULL)
1608 return -1;
1609 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1610 PyUnicode_GET_LENGTH(*p_unicode));
1611 Py_DECREF(*p_unicode);
1612 *p_unicode = result;
1613 return 0;
1614}
1615
1616static int
1617unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1618 Py_UCS4 ch)
1619{
1620 if (unicode_widen(p_unicode, ch) < 0)
1621 return -1;
1622 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1623 PyUnicode_DATA(*p_unicode),
1624 (*pos)++, ch);
1625 return 0;
1626}
1627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001628static PyObject*
1629get_latin1_char(unsigned char ch)
1630{
Victor Stinnera464fc12011-10-02 20:39:30 +02001631 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001633 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001634 if (!unicode)
1635 return NULL;
1636 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001637 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638 unicode_latin1[ch] = unicode;
1639 }
1640 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001641 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642}
1643
Alexander Belopolsky40018472011-02-26 01:02:56 +00001644PyObject *
1645PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001646{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001647 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001648 Py_UCS4 maxchar = 0;
1649 Py_ssize_t num_surrogates;
1650
1651 if (u == NULL)
1652 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001653
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001654 /* If the Unicode data is known at construction time, we can apply
1655 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001657 /* Optimization for empty strings */
1658 if (size == 0 && unicode_empty != NULL) {
1659 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001660 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001661 }
Tim Petersced69f82003-09-16 20:30:58 +00001662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001663 /* Single character Unicode objects in the Latin-1 range are
1664 shared when using this constructor */
1665 if (size == 1 && *u < 256)
1666 return get_latin1_char((unsigned char)*u);
1667
1668 /* If not empty and not single character, copy the Unicode data
1669 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001670 if (find_maxchar_surrogates(u, u + size,
1671 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672 return NULL;
1673
Victor Stinner8faf8212011-12-08 22:14:11 +01001674 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001675 if (!unicode)
1676 return NULL;
1677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001678 switch (PyUnicode_KIND(unicode)) {
1679 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001680 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1682 break;
1683 case PyUnicode_2BYTE_KIND:
1684#if Py_UNICODE_SIZE == 2
1685 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1686#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001687 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001688 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1689#endif
1690 break;
1691 case PyUnicode_4BYTE_KIND:
1692#if SIZEOF_WCHAR_T == 2
1693 /* This is the only case which has to process surrogates, thus
1694 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001695 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001696#else
1697 assert(num_surrogates == 0);
1698 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1699#endif
1700 break;
1701 default:
1702 assert(0 && "Impossible state");
1703 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001704
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001705 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001706}
1707
Alexander Belopolsky40018472011-02-26 01:02:56 +00001708PyObject *
1709PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001710{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001711 if (size < 0) {
1712 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001713 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001714 return NULL;
1715 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001716
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001717 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001718 some optimizations which share commonly used objects.
1719 Also, this means the input must be UTF-8, so fall back to the
1720 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001721 if (u != NULL) {
1722
Benjamin Peterson29060642009-01-31 22:14:21 +00001723 /* Optimization for empty strings */
1724 if (size == 0 && unicode_empty != NULL) {
1725 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001726 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001727 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001728
1729 /* Single characters are shared when using this constructor.
1730 Restrict to ASCII, since the input must be UTF-8. */
Victor Stinner9faa3842011-10-23 20:06:00 +02001731 if (size == 1 && (unsigned char)*u < 128)
1732 return get_latin1_char((unsigned char)*u);
Martin v. Löwis9c121062007-08-05 20:26:11 +00001733
1734 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001735 }
1736
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001737 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001738}
1739
Alexander Belopolsky40018472011-02-26 01:02:56 +00001740PyObject *
1741PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001742{
1743 size_t size = strlen(u);
1744 if (size > PY_SSIZE_T_MAX) {
1745 PyErr_SetString(PyExc_OverflowError, "input too long");
1746 return NULL;
1747 }
1748
1749 return PyUnicode_FromStringAndSize(u, size);
1750}
1751
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001752PyObject *
1753_PyUnicode_FromId(_Py_Identifier *id)
1754{
1755 if (!id->object) {
1756 id->object = PyUnicode_FromString(id->string);
1757 if (!id->object)
1758 return NULL;
1759 PyUnicode_InternInPlace(&id->object);
1760 assert(!id->next);
1761 id->next = static_strings;
1762 static_strings = id;
1763 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001764 return id->object;
1765}
1766
1767void
1768_PyUnicode_ClearStaticStrings()
1769{
1770 _Py_Identifier *i;
1771 for (i = static_strings; i; i = i->next) {
1772 Py_DECREF(i->object);
1773 i->object = NULL;
1774 i->next = NULL;
1775 }
1776}
1777
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001778/* Internal function, don't check maximum character */
1779
Victor Stinnere57b1c02011-09-28 22:20:48 +02001780static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001781unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001782{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001783 PyObject *res;
1784#ifdef Py_DEBUG
1785 const unsigned char *p;
1786 const unsigned char *end = s + size;
1787 for (p=s; p < end; p++) {
1788 assert(*p < 128);
1789 }
1790#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001791 if (size == 1)
1792 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001793 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001794 if (!res)
1795 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001796 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001797 return res;
1798}
1799
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001800static Py_UCS4
1801kind_maxchar_limit(unsigned int kind)
1802{
1803 switch(kind) {
1804 case PyUnicode_1BYTE_KIND:
1805 return 0x80;
1806 case PyUnicode_2BYTE_KIND:
1807 return 0x100;
1808 case PyUnicode_4BYTE_KIND:
1809 return 0x10000;
1810 default:
1811 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001812 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001813 }
1814}
1815
Victor Stinner702c7342011-10-05 13:50:52 +02001816static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001817_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001818{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001819 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001820 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001821
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001822 if (size == 0) {
1823 Py_INCREF(unicode_empty);
1824 return unicode_empty;
1825 }
1826 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001827 if (size == 1)
1828 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001829
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001830 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001831 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832 if (!res)
1833 return NULL;
1834 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001835 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001836 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001837}
1838
Victor Stinnere57b1c02011-09-28 22:20:48 +02001839static PyObject*
1840_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841{
1842 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001843 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001844
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001845 if (size == 0) {
1846 Py_INCREF(unicode_empty);
1847 return unicode_empty;
1848 }
1849 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001850 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001851 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001852
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001853 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001854 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 if (!res)
1856 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001857 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001858 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001859 else {
1860 _PyUnicode_CONVERT_BYTES(
1861 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1862 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001863 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001864 return res;
1865}
1866
Victor Stinnere57b1c02011-09-28 22:20:48 +02001867static PyObject*
1868_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001869{
1870 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001871 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001872
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001873 if (size == 0) {
1874 Py_INCREF(unicode_empty);
1875 return unicode_empty;
1876 }
1877 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001878 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001879 return get_latin1_char((unsigned char)u[0]);
1880
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001881 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001882 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 if (!res)
1884 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001885 if (max_char < 256)
1886 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1887 PyUnicode_1BYTE_DATA(res));
1888 else if (max_char < 0x10000)
1889 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1890 PyUnicode_2BYTE_DATA(res));
1891 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001893 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001894 return res;
1895}
1896
1897PyObject*
1898PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1899{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001900 if (size < 0) {
1901 PyErr_SetString(PyExc_ValueError, "size must be positive");
1902 return NULL;
1903 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001904 switch(kind) {
1905 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001906 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001908 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001910 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001911 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001912 PyErr_SetString(PyExc_SystemError, "invalid kind");
1913 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001914 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001915}
1916
Victor Stinner25a4b292011-10-06 12:31:55 +02001917/* Ensure that a string uses the most efficient storage, if it is not the
1918 case: create a new string with of the right kind. Write NULL into *p_unicode
1919 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001920static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001921unicode_adjust_maxchar(PyObject **p_unicode)
1922{
1923 PyObject *unicode, *copy;
1924 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001925 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001926 unsigned int kind;
1927
1928 assert(p_unicode != NULL);
1929 unicode = *p_unicode;
1930 assert(PyUnicode_IS_READY(unicode));
1931 if (PyUnicode_IS_ASCII(unicode))
1932 return;
1933
1934 len = PyUnicode_GET_LENGTH(unicode);
1935 kind = PyUnicode_KIND(unicode);
1936 if (kind == PyUnicode_1BYTE_KIND) {
1937 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001938 max_char = ucs1lib_find_max_char(u, u + len);
1939 if (max_char >= 128)
1940 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001941 }
1942 else if (kind == PyUnicode_2BYTE_KIND) {
1943 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001944 max_char = ucs2lib_find_max_char(u, u + len);
1945 if (max_char >= 256)
1946 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001947 }
1948 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001949 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001950 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001951 max_char = ucs4lib_find_max_char(u, u + len);
1952 if (max_char >= 0x10000)
1953 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001954 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001955 copy = PyUnicode_New(len, max_char);
1956 copy_characters(copy, 0, unicode, 0, len);
1957 Py_DECREF(unicode);
1958 *p_unicode = copy;
1959}
1960
Victor Stinner034f6cf2011-09-30 02:26:44 +02001961PyObject*
1962PyUnicode_Copy(PyObject *unicode)
1963{
Victor Stinner87af4f22011-11-21 23:03:47 +01001964 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001965 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001966
Victor Stinner034f6cf2011-09-30 02:26:44 +02001967 if (!PyUnicode_Check(unicode)) {
1968 PyErr_BadInternalCall();
1969 return NULL;
1970 }
1971 if (PyUnicode_READY(unicode))
1972 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001973
Victor Stinner87af4f22011-11-21 23:03:47 +01001974 length = PyUnicode_GET_LENGTH(unicode);
1975 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001976 if (!copy)
1977 return NULL;
1978 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1979
Victor Stinner87af4f22011-11-21 23:03:47 +01001980 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1981 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001982 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001983 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001984}
1985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986
Victor Stinnerbc603d12011-10-02 01:00:40 +02001987/* Widen Unicode objects to larger buffers. Don't write terminating null
1988 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001989
1990void*
1991_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1992{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001993 Py_ssize_t len;
1994 void *result;
1995 unsigned int skind;
1996
1997 if (PyUnicode_READY(s))
1998 return NULL;
1999
2000 len = PyUnicode_GET_LENGTH(s);
2001 skind = PyUnicode_KIND(s);
2002 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002003 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004 return NULL;
2005 }
2006 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002007 case PyUnicode_2BYTE_KIND:
2008 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2009 if (!result)
2010 return PyErr_NoMemory();
2011 assert(skind == PyUnicode_1BYTE_KIND);
2012 _PyUnicode_CONVERT_BYTES(
2013 Py_UCS1, Py_UCS2,
2014 PyUnicode_1BYTE_DATA(s),
2015 PyUnicode_1BYTE_DATA(s) + len,
2016 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002017 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002018 case PyUnicode_4BYTE_KIND:
2019 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2020 if (!result)
2021 return PyErr_NoMemory();
2022 if (skind == PyUnicode_2BYTE_KIND) {
2023 _PyUnicode_CONVERT_BYTES(
2024 Py_UCS2, Py_UCS4,
2025 PyUnicode_2BYTE_DATA(s),
2026 PyUnicode_2BYTE_DATA(s) + len,
2027 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002029 else {
2030 assert(skind == PyUnicode_1BYTE_KIND);
2031 _PyUnicode_CONVERT_BYTES(
2032 Py_UCS1, Py_UCS4,
2033 PyUnicode_1BYTE_DATA(s),
2034 PyUnicode_1BYTE_DATA(s) + len,
2035 result);
2036 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002038 default:
2039 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002040 }
Victor Stinner01698042011-10-04 00:04:26 +02002041 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042 return NULL;
2043}
2044
2045static Py_UCS4*
2046as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2047 int copy_null)
2048{
2049 int kind;
2050 void *data;
2051 Py_ssize_t len, targetlen;
2052 if (PyUnicode_READY(string) == -1)
2053 return NULL;
2054 kind = PyUnicode_KIND(string);
2055 data = PyUnicode_DATA(string);
2056 len = PyUnicode_GET_LENGTH(string);
2057 targetlen = len;
2058 if (copy_null)
2059 targetlen++;
2060 if (!target) {
2061 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2062 PyErr_NoMemory();
2063 return NULL;
2064 }
2065 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2066 if (!target) {
2067 PyErr_NoMemory();
2068 return NULL;
2069 }
2070 }
2071 else {
2072 if (targetsize < targetlen) {
2073 PyErr_Format(PyExc_SystemError,
2074 "string is longer than the buffer");
2075 if (copy_null && 0 < targetsize)
2076 target[0] = 0;
2077 return NULL;
2078 }
2079 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002080 if (kind == PyUnicode_1BYTE_KIND) {
2081 Py_UCS1 *start = (Py_UCS1 *) data;
2082 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002083 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002084 else if (kind == PyUnicode_2BYTE_KIND) {
2085 Py_UCS2 *start = (Py_UCS2 *) data;
2086 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2087 }
2088 else {
2089 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002090 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002091 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002092 if (copy_null)
2093 target[len] = 0;
2094 return target;
2095}
2096
2097Py_UCS4*
2098PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2099 int copy_null)
2100{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002101 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002102 PyErr_BadInternalCall();
2103 return NULL;
2104 }
2105 return as_ucs4(string, target, targetsize, copy_null);
2106}
2107
2108Py_UCS4*
2109PyUnicode_AsUCS4Copy(PyObject *string)
2110{
2111 return as_ucs4(string, NULL, 0, 1);
2112}
2113
2114#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002115
Alexander Belopolsky40018472011-02-26 01:02:56 +00002116PyObject *
2117PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002119 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00002120 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002121 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00002122 PyErr_BadInternalCall();
2123 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124 }
2125
Martin v. Löwis790465f2008-04-05 20:41:37 +00002126 if (size == -1) {
2127 size = wcslen(w);
2128 }
2129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002130 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131}
2132
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002133#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002134
Walter Dörwald346737f2007-05-31 10:44:43 +00002135static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002136makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2137 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002138{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002139 *fmt++ = '%';
2140 if (width) {
2141 if (zeropad)
2142 *fmt++ = '0';
2143 fmt += sprintf(fmt, "%d", width);
2144 }
2145 if (precision)
2146 fmt += sprintf(fmt, ".%d", precision);
2147 if (longflag)
2148 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002149 else if (longlongflag) {
2150 /* longlongflag should only ever be nonzero on machines with
2151 HAVE_LONG_LONG defined */
2152#ifdef HAVE_LONG_LONG
2153 char *f = PY_FORMAT_LONG_LONG;
2154 while (*f)
2155 *fmt++ = *f++;
2156#else
2157 /* we shouldn't ever get here */
2158 assert(0);
2159 *fmt++ = 'l';
2160#endif
2161 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002162 else if (size_tflag) {
2163 char *f = PY_FORMAT_SIZE_T;
2164 while (*f)
2165 *fmt++ = *f++;
2166 }
2167 *fmt++ = c;
2168 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002169}
2170
/* helper for PyUnicode_FromFormatV()

   Parse the part of a format directive that follows the character at `f`
   (the caller passes a pointer to it; it is skipped unconditionally):
   an optional decimal width, an optional ".precision", and an optional
   length modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z") that
   is only recognised when directly followed by 'd', 'u' or 'i'.

   Each result is stored through its output pointer when that pointer is
   not NULL.  Return the updated position in the format string. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld, %llu and the size_t flag (%zd, %zu, %zi). */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            ++f;
        }
        break;
    default:
        break;
    }

    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2235
/* Upper bound on the number of characters produced when formatting with %ld:
   21 characters are enough for a 64-bit integer written in decimal together
   with an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced when formatting with
   %lld.  A long long needs at most ceil(log10(256)*SIZEOF_LONG_LONG) decimal
   digits plus one character for the sign.  53/22 is an upper bound for
   log10(256); the constant 2 covers the sign and the rounding up of the
   integer division. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2243
Walter Dörwaldd2034312007-05-18 16:29:38 +00002244PyObject *
2245PyUnicode_FromFormatV(const char *format, va_list vargs)
2246{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002247 va_list count;
2248 Py_ssize_t callcount = 0;
2249 PyObject **callresults = NULL;
2250 PyObject **callresult = NULL;
2251 Py_ssize_t n = 0;
2252 int width = 0;
2253 int precision = 0;
2254 int zeropad;
2255 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002256 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002257 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002258 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002259 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2260 Py_UCS4 argmaxchar;
2261 Py_ssize_t numbersize = 0;
2262 char *numberresults = NULL;
2263 char *numberresult = NULL;
2264 Py_ssize_t i;
2265 int kind;
2266 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002267
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002268 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002269 /* step 1: count the number of %S/%R/%A/%s format specifications
2270 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2271 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002273 * also estimate a upper bound for all the number formats in the string,
2274 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002275 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002276 for (f = format; *f; f++) {
2277 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002278 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002279 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2280 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2281 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2282 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002283
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002285#ifdef HAVE_LONG_LONG
2286 if (longlongflag) {
2287 if (width < MAX_LONG_LONG_CHARS)
2288 width = MAX_LONG_LONG_CHARS;
2289 }
2290 else
2291#endif
2292 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2293 including sign. Decimal takes the most space. This
2294 isn't enough for octal. If a width is specified we
2295 need more (which we allocate later). */
2296 if (width < MAX_LONG_CHARS)
2297 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002298
2299 /* account for the size + '\0' to separate numbers
2300 inside of the numberresults buffer */
2301 numbersize += (width + 1);
2302 }
2303 }
2304 else if ((unsigned char)*f > 127) {
2305 PyErr_Format(PyExc_ValueError,
2306 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2307 "string, got a non-ASCII byte: 0x%02x",
2308 (unsigned char)*f);
2309 return NULL;
2310 }
2311 }
2312 /* step 2: allocate memory for the results of
2313 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2314 if (callcount) {
2315 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2316 if (!callresults) {
2317 PyErr_NoMemory();
2318 return NULL;
2319 }
2320 callresult = callresults;
2321 }
2322 /* step 2.5: allocate memory for the results of formating numbers */
2323 if (numbersize) {
2324 numberresults = PyObject_Malloc(numbersize);
2325 if (!numberresults) {
2326 PyErr_NoMemory();
2327 goto fail;
2328 }
2329 numberresult = numberresults;
2330 }
2331
2332 /* step 3: format numbers and figure out how large a buffer we need */
2333 for (f = format; *f; f++) {
2334 if (*f == '%') {
2335 const char* p;
2336 int longflag;
2337 int longlongflag;
2338 int size_tflag;
2339 int numprinted;
2340
2341 p = f;
2342 zeropad = (f[1] == '0');
2343 f = parse_format_flags(f, &width, &precision,
2344 &longflag, &longlongflag, &size_tflag);
2345 switch (*f) {
2346 case 'c':
2347 {
2348 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002349 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002350 n++;
2351 break;
2352 }
2353 case '%':
2354 n++;
2355 break;
2356 case 'i':
2357 case 'd':
2358 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2359 width, precision, *f);
2360 if (longflag)
2361 numprinted = sprintf(numberresult, fmt,
2362 va_arg(count, long));
2363#ifdef HAVE_LONG_LONG
2364 else if (longlongflag)
2365 numprinted = sprintf(numberresult, fmt,
2366 va_arg(count, PY_LONG_LONG));
2367#endif
2368 else if (size_tflag)
2369 numprinted = sprintf(numberresult, fmt,
2370 va_arg(count, Py_ssize_t));
2371 else
2372 numprinted = sprintf(numberresult, fmt,
2373 va_arg(count, int));
2374 n += numprinted;
2375 /* advance by +1 to skip over the '\0' */
2376 numberresult += (numprinted + 1);
2377 assert(*(numberresult - 1) == '\0');
2378 assert(*(numberresult - 2) != '\0');
2379 assert(numprinted >= 0);
2380 assert(numberresult <= numberresults + numbersize);
2381 break;
2382 case 'u':
2383 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2384 width, precision, 'u');
2385 if (longflag)
2386 numprinted = sprintf(numberresult, fmt,
2387 va_arg(count, unsigned long));
2388#ifdef HAVE_LONG_LONG
2389 else if (longlongflag)
2390 numprinted = sprintf(numberresult, fmt,
2391 va_arg(count, unsigned PY_LONG_LONG));
2392#endif
2393 else if (size_tflag)
2394 numprinted = sprintf(numberresult, fmt,
2395 va_arg(count, size_t));
2396 else
2397 numprinted = sprintf(numberresult, fmt,
2398 va_arg(count, unsigned int));
2399 n += numprinted;
2400 numberresult += (numprinted + 1);
2401 assert(*(numberresult - 1) == '\0');
2402 assert(*(numberresult - 2) != '\0');
2403 assert(numprinted >= 0);
2404 assert(numberresult <= numberresults + numbersize);
2405 break;
2406 case 'x':
2407 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2408 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2409 n += numprinted;
2410 numberresult += (numprinted + 1);
2411 assert(*(numberresult - 1) == '\0');
2412 assert(*(numberresult - 2) != '\0');
2413 assert(numprinted >= 0);
2414 assert(numberresult <= numberresults + numbersize);
2415 break;
2416 case 'p':
2417 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2418 /* %p is ill-defined: ensure leading 0x. */
2419 if (numberresult[1] == 'X')
2420 numberresult[1] = 'x';
2421 else if (numberresult[1] != 'x') {
2422 memmove(numberresult + 2, numberresult,
2423 strlen(numberresult) + 1);
2424 numberresult[0] = '0';
2425 numberresult[1] = 'x';
2426 numprinted += 2;
2427 }
2428 n += numprinted;
2429 numberresult += (numprinted + 1);
2430 assert(*(numberresult - 1) == '\0');
2431 assert(*(numberresult - 2) != '\0');
2432 assert(numprinted >= 0);
2433 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002434 break;
2435 case 's':
2436 {
2437 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002438 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002439 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2440 if (!str)
2441 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002442 /* since PyUnicode_DecodeUTF8 returns already flexible
2443 unicode objects, there is no need to call ready on them */
2444 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002445 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002446 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002447 /* Remember the str and switch to the next slot */
2448 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002449 break;
2450 }
2451 case 'U':
2452 {
2453 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002454 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002455 if (PyUnicode_READY(obj) == -1)
2456 goto fail;
2457 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002458 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002460 break;
2461 }
2462 case 'V':
2463 {
2464 PyObject *obj = va_arg(count, PyObject *);
2465 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002466 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002467 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002468 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002469 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002470 if (PyUnicode_READY(obj) == -1)
2471 goto fail;
2472 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002473 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002474 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002475 *callresult++ = NULL;
2476 }
2477 else {
2478 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2479 if (!str_obj)
2480 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002481 if (PyUnicode_READY(str_obj)) {
2482 Py_DECREF(str_obj);
2483 goto fail;
2484 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002485 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002486 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002487 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002488 *callresult++ = str_obj;
2489 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002490 break;
2491 }
2492 case 'S':
2493 {
2494 PyObject *obj = va_arg(count, PyObject *);
2495 PyObject *str;
2496 assert(obj);
2497 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002498 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002499 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002500 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002501 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002502 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002503 /* Remember the str and switch to the next slot */
2504 *callresult++ = str;
2505 break;
2506 }
2507 case 'R':
2508 {
2509 PyObject *obj = va_arg(count, PyObject *);
2510 PyObject *repr;
2511 assert(obj);
2512 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002513 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002514 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002515 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002516 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002517 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002518 /* Remember the repr and switch to the next slot */
2519 *callresult++ = repr;
2520 break;
2521 }
2522 case 'A':
2523 {
2524 PyObject *obj = va_arg(count, PyObject *);
2525 PyObject *ascii;
2526 assert(obj);
2527 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002528 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002529 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002530 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002531 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002532 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002533 /* Remember the repr and switch to the next slot */
2534 *callresult++ = ascii;
2535 break;
2536 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002537 default:
2538 /* if we stumble upon an unknown
2539 formatting code, copy the rest of
2540 the format string to the output
2541 string. (we cannot just skip the
2542 code, since there's no way to know
2543 what's in the argument list) */
2544 n += strlen(p);
2545 goto expand;
2546 }
2547 } else
2548 n++;
2549 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002550 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002551 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002552 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002553 we don't have to resize the string.
2554 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002555 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002556 if (!string)
2557 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002558 kind = PyUnicode_KIND(string);
2559 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002560 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002561 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002562
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002563 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002564 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002565 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002566
2567 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002568 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2569 /* checking for == because the last argument could be a empty
2570 string, which causes i to point to end, the assert at the end of
2571 the loop */
2572 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002573
Benjamin Peterson14339b62009-01-31 16:36:08 +00002574 switch (*f) {
2575 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002576 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002577 const int ordinal = va_arg(vargs, int);
2578 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002579 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002580 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002581 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002582 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002583 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002584 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002585 case 'p':
2586 /* unused, since we already have the result */
2587 if (*f == 'p')
2588 (void) va_arg(vargs, void *);
2589 else
2590 (void) va_arg(vargs, int);
2591 /* extract the result from numberresults and append. */
2592 for (; *numberresult; ++i, ++numberresult)
2593 PyUnicode_WRITE(kind, data, i, *numberresult);
2594 /* skip over the separating '\0' */
2595 assert(*numberresult == '\0');
2596 numberresult++;
2597 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002598 break;
2599 case 's':
2600 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002601 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002602 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002603 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002604 size = PyUnicode_GET_LENGTH(*callresult);
2605 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002606 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002607 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002608 /* We're done with the unicode()/repr() => forget it */
2609 Py_DECREF(*callresult);
2610 /* switch to next unicode()/repr() result */
2611 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002612 break;
2613 }
2614 case 'U':
2615 {
2616 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002617 Py_ssize_t size;
2618 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2619 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002620 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002621 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002622 break;
2623 }
2624 case 'V':
2625 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002627 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002628 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002629 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002630 size = PyUnicode_GET_LENGTH(obj);
2631 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002632 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002633 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002634 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 size = PyUnicode_GET_LENGTH(*callresult);
2636 assert(PyUnicode_KIND(*callresult) <=
2637 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002638 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002639 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002640 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002641 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002642 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002643 break;
2644 }
2645 case 'S':
2646 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002647 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002649 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002650 /* unused, since we already have the result */
2651 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002652 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002653 copy_characters(string, i, *callresult, 0, size);
2654 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002655 /* We're done with the unicode()/repr() => forget it */
2656 Py_DECREF(*callresult);
2657 /* switch to next unicode()/repr() result */
2658 ++callresult;
2659 break;
2660 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002661 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002662 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002663 break;
2664 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002665 for (; *p; ++p, ++i)
2666 PyUnicode_WRITE(kind, data, i, *p);
2667 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 goto end;
2669 }
Victor Stinner1205f272010-09-11 00:54:47 +00002670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002671 else {
2672 assert(i < PyUnicode_GET_LENGTH(string));
2673 PyUnicode_WRITE(kind, data, i++, *f);
2674 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002676 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002677
Benjamin Peterson29060642009-01-31 22:14:21 +00002678 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002679 if (callresults)
2680 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002681 if (numberresults)
2682 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002683 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002684 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002685 if (callresults) {
2686 PyObject **callresult2 = callresults;
2687 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002688 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002689 ++callresult2;
2690 }
2691 PyObject_Free(callresults);
2692 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002693 if (numberresults)
2694 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002695 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002696}
2697
Walter Dörwaldd2034312007-05-18 16:29:38 +00002698PyObject *
2699PyUnicode_FromFormat(const char *format, ...)
2700{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002701 PyObject* ret;
2702 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002703
2704#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002705 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002706#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002707 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002708#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002709 ret = PyUnicode_FromFormatV(format, vargs);
2710 va_end(vargs);
2711 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002712}
2713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002714#ifdef HAVE_WCHAR_H
2715
Victor Stinner5593d8a2010-10-02 11:11:27 +00002716/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2717 convert a Unicode object to a wide character string.
2718
Victor Stinnerd88d9832011-09-06 02:00:05 +02002719 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002720 character) required to convert the unicode object. Ignore size argument.
2721
Victor Stinnerd88d9832011-09-06 02:00:05 +02002722 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002723 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002724 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002725static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002726unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002727 wchar_t *w,
2728 Py_ssize_t size)
2729{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002730 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002731 const wchar_t *wstr;
2732
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002733 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002734 if (wstr == NULL)
2735 return -1;
2736
Victor Stinner5593d8a2010-10-02 11:11:27 +00002737 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002738 if (size > res)
2739 size = res + 1;
2740 else
2741 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002742 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002743 return res;
2744 }
2745 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002746 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002747}
2748
2749Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002750PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002751 wchar_t *w,
2752 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002753{
2754 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002755 PyErr_BadInternalCall();
2756 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002758 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759}
2760
Victor Stinner137c34c2010-09-29 10:25:54 +00002761wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002762PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002763 Py_ssize_t *size)
2764{
2765 wchar_t* buffer;
2766 Py_ssize_t buflen;
2767
2768 if (unicode == NULL) {
2769 PyErr_BadInternalCall();
2770 return NULL;
2771 }
2772
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002773 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002774 if (buflen == -1)
2775 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002776 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002777 PyErr_NoMemory();
2778 return NULL;
2779 }
2780
Victor Stinner137c34c2010-09-29 10:25:54 +00002781 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2782 if (buffer == NULL) {
2783 PyErr_NoMemory();
2784 return NULL;
2785 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002786 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002787 if (buflen == -1)
2788 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002789 if (size != NULL)
2790 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002791 return buffer;
2792}
2793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002794#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795
Alexander Belopolsky40018472011-02-26 01:02:56 +00002796PyObject *
2797PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002798{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002799 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002800 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002801 PyErr_SetString(PyExc_ValueError,
2802 "chr() arg not in range(0x110000)");
2803 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002804 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002806 if (ordinal < 256)
2807 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002809 v = PyUnicode_New(1, ordinal);
2810 if (v == NULL)
2811 return NULL;
2812 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002813 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002814 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002815}
2816
Alexander Belopolsky40018472011-02-26 01:02:56 +00002817PyObject *
2818PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002819{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002820 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002821 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002822 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002823 if (PyUnicode_READY(obj))
2824 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002825 Py_INCREF(obj);
2826 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002827 }
2828 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002829 /* For a Unicode subtype that's not a Unicode object,
2830 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002831 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002832 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002833 PyErr_Format(PyExc_TypeError,
2834 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002835 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002836 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002837}
2838
Alexander Belopolsky40018472011-02-26 01:02:56 +00002839PyObject *
2840PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002841 const char *encoding,
2842 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002843{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002844 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002845 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002846
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002848 PyErr_BadInternalCall();
2849 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002851
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002852 /* Decoding bytes objects is the most common case and should be fast */
2853 if (PyBytes_Check(obj)) {
2854 if (PyBytes_GET_SIZE(obj) == 0) {
2855 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002856 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002857 }
2858 else {
2859 v = PyUnicode_Decode(
2860 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2861 encoding, errors);
2862 }
2863 return v;
2864 }
2865
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002866 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002867 PyErr_SetString(PyExc_TypeError,
2868 "decoding str is not supported");
2869 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002870 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002871
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002872 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2873 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2874 PyErr_Format(PyExc_TypeError,
2875 "coercing to str: need bytes, bytearray "
2876 "or buffer-like object, %.80s found",
2877 Py_TYPE(obj)->tp_name);
2878 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002879 }
Tim Petersced69f82003-09-16 20:30:58 +00002880
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002881 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002882 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002883 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002884 }
Tim Petersced69f82003-09-16 20:30:58 +00002885 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002886 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002887
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002888 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002889 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002890}
2891
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.

   encoding:  NUL-terminated encoding name, or NULL, which is normalized
              to "utf-8".
   lower:     output buffer that receives the NUL-terminated result.
   lower_len: size of the lower buffer in bytes, terminator included.

   Return 1 on success, 0 on error (the normalized name plus its
   terminating NUL does not fit into lower_len bytes).  When 0 is returned
   the contents of lower must not be used. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_name[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* sizeof counts the terminating NUL; refuse to overflow a buffer
           that is too small for the default name. */
        if (lower_len < sizeof(default_name))
            return 0;
        strcpy(lower, default_name);
        return 1;
    }

    /* With no room even for the terminator, lower_len - 1 below would wrap
       around and the bounds check in the loop could never trigger. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last writable slot; it is reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2928
Alexander Belopolsky40018472011-02-26 01:02:56 +00002929PyObject *
2930PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002931 Py_ssize_t size,
2932 const char *encoding,
2933 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002934{
2935 PyObject *buffer = NULL, *unicode;
2936 Py_buffer info;
2937 char lower[11]; /* Enough for any encoding shortcut */
2938
Fred Drakee4315f52000-05-09 19:53:39 +00002939 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002940 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002941 if ((strcmp(lower, "utf-8") == 0) ||
2942 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002943 return PyUnicode_DecodeUTF8(s, size, errors);
2944 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002945 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002946 (strcmp(lower, "iso-8859-1") == 0))
2947 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002948#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002949 else if (strcmp(lower, "mbcs") == 0)
2950 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002951#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002952 else if (strcmp(lower, "ascii") == 0)
2953 return PyUnicode_DecodeASCII(s, size, errors);
2954 else if (strcmp(lower, "utf-16") == 0)
2955 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2956 else if (strcmp(lower, "utf-32") == 0)
2957 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2958 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002959
2960 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002961 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002962 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002963 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002964 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965 if (buffer == NULL)
2966 goto onError;
2967 unicode = PyCodec_Decode(buffer, encoding, errors);
2968 if (unicode == NULL)
2969 goto onError;
2970 if (!PyUnicode_Check(unicode)) {
2971 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002972 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002973 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002974 Py_DECREF(unicode);
2975 goto onError;
2976 }
2977 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002978 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002979
Benjamin Peterson29060642009-01-31 22:14:21 +00002980 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 Py_XDECREF(buffer);
2982 return NULL;
2983}
2984
Alexander Belopolsky40018472011-02-26 01:02:56 +00002985PyObject *
2986PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002987 const char *encoding,
2988 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002989{
2990 PyObject *v;
2991
2992 if (!PyUnicode_Check(unicode)) {
2993 PyErr_BadArgument();
2994 goto onError;
2995 }
2996
2997 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002998 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002999
3000 /* Decode via the codec registry */
3001 v = PyCodec_Decode(unicode, encoding, errors);
3002 if (v == NULL)
3003 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003004 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003005
Benjamin Peterson29060642009-01-31 22:14:21 +00003006 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003007 return NULL;
3008}
3009
Alexander Belopolsky40018472011-02-26 01:02:56 +00003010PyObject *
3011PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003012 const char *encoding,
3013 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003014{
3015 PyObject *v;
3016
3017 if (!PyUnicode_Check(unicode)) {
3018 PyErr_BadArgument();
3019 goto onError;
3020 }
3021
3022 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003023 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003024
3025 /* Decode via the codec registry */
3026 v = PyCodec_Decode(unicode, encoding, errors);
3027 if (v == NULL)
3028 goto onError;
3029 if (!PyUnicode_Check(v)) {
3030 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003031 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003032 Py_TYPE(v)->tp_name);
3033 Py_DECREF(v);
3034 goto onError;
3035 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003036 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003037
Benjamin Peterson29060642009-01-31 22:14:21 +00003038 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003039 return NULL;
3040}
3041
Alexander Belopolsky40018472011-02-26 01:02:56 +00003042PyObject *
3043PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003044 Py_ssize_t size,
3045 const char *encoding,
3046 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047{
3048 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003049
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 unicode = PyUnicode_FromUnicode(s, size);
3051 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003052 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3054 Py_DECREF(unicode);
3055 return v;
3056}
3057
Alexander Belopolsky40018472011-02-26 01:02:56 +00003058PyObject *
3059PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003060 const char *encoding,
3061 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003062{
3063 PyObject *v;
3064
3065 if (!PyUnicode_Check(unicode)) {
3066 PyErr_BadArgument();
3067 goto onError;
3068 }
3069
3070 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003071 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003072
3073 /* Encode via the codec registry */
3074 v = PyCodec_Encode(unicode, encoding, errors);
3075 if (v == NULL)
3076 goto onError;
3077 return v;
3078
Benjamin Peterson29060642009-01-31 22:14:21 +00003079 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003080 return NULL;
3081}
3082
Victor Stinnerad158722010-10-27 00:25:46 +00003083PyObject *
3084PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003085{
Victor Stinner99b95382011-07-04 14:23:54 +02003086#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003087 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003088#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003089 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003090#else
Victor Stinner793b5312011-04-27 00:24:21 +02003091 PyInterpreterState *interp = PyThreadState_GET()->interp;
3092 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3093 cannot use it to encode and decode filenames before it is loaded. Load
3094 the Python codec requires to encode at least its own filename. Use the C
3095 version of the locale codec until the codec registry is initialized and
3096 the Python codec is loaded.
3097
3098 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3099 cannot only rely on it: check also interp->fscodec_initialized for
3100 subinterpreters. */
3101 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003102 return PyUnicode_AsEncodedString(unicode,
3103 Py_FileSystemDefaultEncoding,
3104 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003105 }
3106 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003107 /* locale encoding with surrogateescape */
3108 wchar_t *wchar;
3109 char *bytes;
3110 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003111 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003112
3113 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3114 if (wchar == NULL)
3115 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003116 bytes = _Py_wchar2char(wchar, &error_pos);
3117 if (bytes == NULL) {
3118 if (error_pos != (size_t)-1) {
3119 char *errmsg = strerror(errno);
3120 PyObject *exc = NULL;
3121 if (errmsg == NULL)
3122 errmsg = "Py_wchar2char() failed";
3123 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003124 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003125 error_pos, error_pos+1,
3126 errmsg);
3127 Py_XDECREF(exc);
3128 }
3129 else
3130 PyErr_NoMemory();
3131 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003132 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003133 }
3134 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003135
3136 bytes_obj = PyBytes_FromString(bytes);
3137 PyMem_Free(bytes);
3138 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003139 }
Victor Stinnerad158722010-10-27 00:25:46 +00003140#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003141}
3142
Alexander Belopolsky40018472011-02-26 01:02:56 +00003143PyObject *
3144PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003145 const char *encoding,
3146 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147{
3148 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003149 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003150
Guido van Rossumd57fd912000-03-10 22:53:23 +00003151 if (!PyUnicode_Check(unicode)) {
3152 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003153 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003154 }
Fred Drakee4315f52000-05-09 19:53:39 +00003155
Fred Drakee4315f52000-05-09 19:53:39 +00003156 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003157 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003158 if ((strcmp(lower, "utf-8") == 0) ||
3159 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003160 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003161 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003162 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003163 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003164 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003165 }
Victor Stinner37296e82010-06-10 13:36:23 +00003166 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003167 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003168 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003169 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003170#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003171 else if (strcmp(lower, "mbcs") == 0)
3172 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003173#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003174 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003175 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003177
3178 /* Encode via the codec registry */
3179 v = PyCodec_Encode(unicode, encoding, errors);
3180 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003181 return NULL;
3182
3183 /* The normal path */
3184 if (PyBytes_Check(v))
3185 return v;
3186
3187 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003188 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003189 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003190 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003191
3192 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3193 "encoder %s returned bytearray instead of bytes",
3194 encoding);
3195 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003196 Py_DECREF(v);
3197 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003198 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003199
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003200 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3201 Py_DECREF(v);
3202 return b;
3203 }
3204
3205 PyErr_Format(PyExc_TypeError,
3206 "encoder did not return a bytes object (type=%.400s)",
3207 Py_TYPE(v)->tp_name);
3208 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003209 return NULL;
3210}
3211
Alexander Belopolsky40018472011-02-26 01:02:56 +00003212PyObject *
3213PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003214 const char *encoding,
3215 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003216{
3217 PyObject *v;
3218
3219 if (!PyUnicode_Check(unicode)) {
3220 PyErr_BadArgument();
3221 goto onError;
3222 }
3223
3224 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003225 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003226
3227 /* Encode via the codec registry */
3228 v = PyCodec_Encode(unicode, encoding, errors);
3229 if (v == NULL)
3230 goto onError;
3231 if (!PyUnicode_Check(v)) {
3232 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003233 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003234 Py_TYPE(v)->tp_name);
3235 Py_DECREF(v);
3236 goto onError;
3237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003238 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003239
Benjamin Peterson29060642009-01-31 22:14:21 +00003240 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241 return NULL;
3242}
3243
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003244PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003245PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003246 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003247 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3248}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003249
Christian Heimes5894ba72007-11-04 11:43:14 +00003250PyObject*
3251PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3252{
Victor Stinner99b95382011-07-04 14:23:54 +02003253#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003254 return PyUnicode_DecodeMBCS(s, size, NULL);
3255#elif defined(__APPLE__)
3256 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3257#else
Victor Stinner793b5312011-04-27 00:24:21 +02003258 PyInterpreterState *interp = PyThreadState_GET()->interp;
3259 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3260 cannot use it to encode and decode filenames before it is loaded. Load
3261 the Python codec requires to encode at least its own filename. Use the C
3262 version of the locale codec until the codec registry is initialized and
3263 the Python codec is loaded.
3264
3265 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3266 cannot only rely on it: check also interp->fscodec_initialized for
3267 subinterpreters. */
3268 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003269 return PyUnicode_Decode(s, size,
3270 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003271 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003272 }
3273 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003274 /* locale encoding with surrogateescape */
3275 wchar_t *wchar;
3276 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003277 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003278
3279 if (s[size] != '\0' || size != strlen(s)) {
3280 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3281 return NULL;
3282 }
3283
Victor Stinner168e1172010-10-16 23:16:16 +00003284 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003285 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003286 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003287
Victor Stinner168e1172010-10-16 23:16:16 +00003288 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003289 PyMem_Free(wchar);
3290 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003291 }
Victor Stinnerad158722010-10-27 00:25:46 +00003292#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003293}
3294
Martin v. Löwis011e8422009-05-05 04:43:17 +00003295
3296int
3297PyUnicode_FSConverter(PyObject* arg, void* addr)
3298{
3299 PyObject *output = NULL;
3300 Py_ssize_t size;
3301 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003302 if (arg == NULL) {
3303 Py_DECREF(*(PyObject**)addr);
3304 return 1;
3305 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003306 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003307 output = arg;
3308 Py_INCREF(output);
3309 }
3310 else {
3311 arg = PyUnicode_FromObject(arg);
3312 if (!arg)
3313 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003314 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003315 Py_DECREF(arg);
3316 if (!output)
3317 return 0;
3318 if (!PyBytes_Check(output)) {
3319 Py_DECREF(output);
3320 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3321 return 0;
3322 }
3323 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003324 size = PyBytes_GET_SIZE(output);
3325 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003326 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003327 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003328 Py_DECREF(output);
3329 return 0;
3330 }
3331 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003332 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003333}
3334
3335
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003336int
3337PyUnicode_FSDecoder(PyObject* arg, void* addr)
3338{
3339 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003340 if (arg == NULL) {
3341 Py_DECREF(*(PyObject**)addr);
3342 return 1;
3343 }
3344 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003345 if (PyUnicode_READY(arg))
3346 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003347 output = arg;
3348 Py_INCREF(output);
3349 }
3350 else {
3351 arg = PyBytes_FromObject(arg);
3352 if (!arg)
3353 return 0;
3354 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3355 PyBytes_GET_SIZE(arg));
3356 Py_DECREF(arg);
3357 if (!output)
3358 return 0;
3359 if (!PyUnicode_Check(output)) {
3360 Py_DECREF(output);
3361 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3362 return 0;
3363 }
3364 }
Victor Stinner065836e2011-10-27 01:56:33 +02003365 if (PyUnicode_READY(output) < 0) {
3366 Py_DECREF(output);
3367 return 0;
3368 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003369 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003370 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003371 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3372 Py_DECREF(output);
3373 return 0;
3374 }
3375 *(PyObject**)addr = output;
3376 return Py_CLEANUP_SUPPORTED;
3377}
3378
3379
Martin v. Löwis5b222132007-06-10 09:51:05 +00003380char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003381PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003382{
Christian Heimesf3863112007-11-22 07:46:41 +00003383 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003384
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003385 if (!PyUnicode_Check(unicode)) {
3386 PyErr_BadArgument();
3387 return NULL;
3388 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003389 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003390 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003391
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003392 if (PyUnicode_UTF8(unicode) == NULL) {
3393 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003394 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3395 if (bytes == NULL)
3396 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003397 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3398 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003399 Py_DECREF(bytes);
3400 return NULL;
3401 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003402 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3403 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3404 PyBytes_AS_STRING(bytes),
3405 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003406 Py_DECREF(bytes);
3407 }
3408
3409 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003410 *psize = PyUnicode_UTF8_LENGTH(unicode);
3411 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003412}
3413
3414char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003415PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003416{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003417 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3418}
3419
#ifdef Py_DEBUG
/* Only present in Py_DEBUG builds: incremented by
   PyUnicode_AsUnicodeAndSize() each time it finds no wstr on the object
   and has to build the wchar_t representation. */
static int unicode_as_unicode_calls = 0;
#endif
3423
3424
3425Py_UNICODE *
3426PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3427{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003428 const unsigned char *one_byte;
3429#if SIZEOF_WCHAR_T == 4
3430 const Py_UCS2 *two_bytes;
3431#else
3432 const Py_UCS4 *four_bytes;
3433 const Py_UCS4 *ucs4_end;
3434 Py_ssize_t num_surrogates;
3435#endif
3436 wchar_t *w;
3437 wchar_t *wchar_end;
3438
3439 if (!PyUnicode_Check(unicode)) {
3440 PyErr_BadArgument();
3441 return NULL;
3442 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003443 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003444 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003445 assert(_PyUnicode_KIND(unicode) != 0);
3446 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003447
3448#ifdef Py_DEBUG
3449 ++unicode_as_unicode_calls;
3450#endif
3451
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003452 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003453#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003454 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3455 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003456 num_surrogates = 0;
3457
3458 for (; four_bytes < ucs4_end; ++four_bytes) {
3459 if (*four_bytes > 0xFFFF)
3460 ++num_surrogates;
3461 }
3462
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003463 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3464 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3465 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003466 PyErr_NoMemory();
3467 return NULL;
3468 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003469 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003470
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003471 w = _PyUnicode_WSTR(unicode);
3472 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3473 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003474 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3475 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003476 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003477 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003478 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3479 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003480 }
3481 else
3482 *w = *four_bytes;
3483
3484 if (w > wchar_end) {
3485 assert(0 && "Miscalculated string end");
3486 }
3487 }
3488 *w = 0;
3489#else
3490 /* sizeof(wchar_t) == 4 */
3491 Py_FatalError("Impossible unicode object state, wstr and str "
3492 "should share memory already.");
3493 return NULL;
3494#endif
3495 }
3496 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003497 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3498 (_PyUnicode_LENGTH(unicode) + 1));
3499 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003500 PyErr_NoMemory();
3501 return NULL;
3502 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003503 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3504 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3505 w = _PyUnicode_WSTR(unicode);
3506 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003507
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003508 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3509 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003510 for (; w < wchar_end; ++one_byte, ++w)
3511 *w = *one_byte;
3512 /* null-terminate the wstr */
3513 *w = 0;
3514 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003515 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003516#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003517 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003518 for (; w < wchar_end; ++two_bytes, ++w)
3519 *w = *two_bytes;
3520 /* null-terminate the wstr */
3521 *w = 0;
3522#else
3523 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003524 PyObject_FREE(_PyUnicode_WSTR(unicode));
3525 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003526 Py_FatalError("Impossible unicode object state, wstr "
3527 "and str should share memory already.");
3528 return NULL;
3529#endif
3530 }
3531 else {
3532 assert(0 && "This should never happen.");
3533 }
3534 }
3535 }
3536 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003537 *size = PyUnicode_WSTR_LENGTH(unicode);
3538 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003539}
3540
Alexander Belopolsky40018472011-02-26 01:02:56 +00003541Py_UNICODE *
3542PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003544 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003545}
3546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003547
Alexander Belopolsky40018472011-02-26 01:02:56 +00003548Py_ssize_t
3549PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003550{
3551 if (!PyUnicode_Check(unicode)) {
3552 PyErr_BadArgument();
3553 goto onError;
3554 }
3555 return PyUnicode_GET_SIZE(unicode);
3556
Benjamin Peterson29060642009-01-31 22:14:21 +00003557 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558 return -1;
3559}
3560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003561Py_ssize_t
3562PyUnicode_GetLength(PyObject *unicode)
3563{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003564 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003565 PyErr_BadArgument();
3566 return -1;
3567 }
3568
3569 return PyUnicode_GET_LENGTH(unicode);
3570}
3571
3572Py_UCS4
3573PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3574{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003575 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3576 PyErr_BadArgument();
3577 return (Py_UCS4)-1;
3578 }
3579 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3580 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003581 return (Py_UCS4)-1;
3582 }
3583 return PyUnicode_READ_CHAR(unicode, index);
3584}
3585
3586int
3587PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3588{
3589 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003590 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003591 return -1;
3592 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003593 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3594 PyErr_SetString(PyExc_IndexError, "string index out of range");
3595 return -1;
3596 }
3597 if (_PyUnicode_Dirty(unicode))
3598 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003599 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3600 index, ch);
3601 return 0;
3602}
3603
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3609
Victor Stinner554f3f02010-06-16 23:33:54 +00003610/* create or adjust a UnicodeDecodeError */
3611static void
3612make_decode_exception(PyObject **exceptionObject,
3613 const char *encoding,
3614 const char *input, Py_ssize_t length,
3615 Py_ssize_t startpos, Py_ssize_t endpos,
3616 const char *reason)
3617{
3618 if (*exceptionObject == NULL) {
3619 *exceptionObject = PyUnicodeDecodeError_Create(
3620 encoding, input, length, startpos, endpos, reason);
3621 }
3622 else {
3623 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3624 goto onError;
3625 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3626 goto onError;
3627 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3628 goto onError;
3629 }
3630 return;
3631
3632onError:
3633 Py_DECREF(*exceptionObject);
3634 *exceptionObject = NULL;
3635}
3636
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637/* error handling callback helper:
3638 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003639 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003640 and adjust various state variables.
3641 return 0 on success, -1 on error
3642*/
3643
Alexander Belopolsky40018472011-02-26 01:02:56 +00003644static int
3645unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003646 const char *encoding, const char *reason,
3647 const char **input, const char **inend, Py_ssize_t *startinpos,
3648 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003649 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003651 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003652
3653 PyObject *restuple = NULL;
3654 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003655 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003656 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003657 Py_ssize_t requiredsize;
3658 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003659 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660 int res = -1;
3661
Victor Stinner596a6c42011-11-09 00:02:18 +01003662 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3663 outsize = PyUnicode_GET_LENGTH(*output);
3664 else
3665 outsize = _PyUnicode_WSTR_LENGTH(*output);
3666
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003667 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003668 *errorHandler = PyCodec_LookupError(errors);
3669 if (*errorHandler == NULL)
3670 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003671 }
3672
Victor Stinner554f3f02010-06-16 23:33:54 +00003673 make_decode_exception(exceptionObject,
3674 encoding,
3675 *input, *inend - *input,
3676 *startinpos, *endinpos,
3677 reason);
3678 if (*exceptionObject == NULL)
3679 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003680
3681 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3682 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003683 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003684 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003685 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003686 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003687 }
3688 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003689 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003690 if (PyUnicode_READY(repunicode) < 0)
3691 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003692
3693 /* Copy back the bytes variables, which might have been modified by the
3694 callback */
3695 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3696 if (!inputobj)
3697 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003698 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003699 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003700 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003701 *input = PyBytes_AS_STRING(inputobj);
3702 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003703 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003704 /* we can DECREF safely, as the exception has another reference,
3705 so the object won't go away. */
3706 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003707
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003708 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003709 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003710 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003711 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3712 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003713 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003714
Victor Stinner596a6c42011-11-09 00:02:18 +01003715 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3716 /* need more space? (at least enough for what we
3717 have+the replacement+the rest of the string (starting
3718 at the new input position), so we won't have to check space
3719 when there are no errors in the rest of the string) */
3720 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3721 requiredsize = *outpos + replen + insize-newpos;
3722 if (requiredsize > outsize) {
3723 if (requiredsize<2*outsize)
3724 requiredsize = 2*outsize;
3725 if (unicode_resize(output, requiredsize) < 0)
3726 goto onError;
3727 }
3728 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003729 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003730 copy_characters(*output, *outpos, repunicode, 0, replen);
3731 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003732 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003733 else {
3734 wchar_t *repwstr;
3735 Py_ssize_t repwlen;
3736 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3737 if (repwstr == NULL)
3738 goto onError;
3739 /* need more space? (at least enough for what we
3740 have+the replacement+the rest of the string (starting
3741 at the new input position), so we won't have to check space
3742 when there are no errors in the rest of the string) */
3743 requiredsize = *outpos + repwlen + insize-newpos;
3744 if (requiredsize > outsize) {
3745 if (requiredsize < 2*outsize)
3746 requiredsize = 2*outsize;
3747 if (unicode_resize(output, requiredsize) < 0)
3748 goto onError;
3749 }
3750 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3751 *outpos += repwlen;
3752 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003753 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003754 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003755
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003756 /* we made it! */
3757 res = 0;
3758
Benjamin Peterson29060642009-01-31 22:14:21 +00003759 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003760 Py_XDECREF(restuple);
3761 return res;
3762}
3763
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test on c comes first so that the table is never indexed
 * out of bounds.  directO and directWS are parenthesized so that
 * arbitrary expressions (e.g. `a || b`) can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003844
Alexander Belopolsky40018472011-02-26 01:02:56 +00003845PyObject *
3846PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003847 Py_ssize_t size,
3848 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003849{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003850 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3851}
3852
Antoine Pitrou244651a2009-05-04 18:56:13 +00003853/* The decoder. The only state we preserve is our read position,
3854 * i.e. how many characters we have consumed. So if we end in the
3855 * middle of a shift sequence we have to back off the read position
3856 * and the output to the beginning of the sequence, otherwise we lose
3857 * all the shift state (seen bits, number of bits seen, high
3858 * surrogate). */
3859
Alexander Belopolsky40018472011-02-26 01:02:56 +00003860PyObject *
3861PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003862 Py_ssize_t size,
3863 const char *errors,
3864 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003865{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003866 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003867 Py_ssize_t startinpos;
3868 Py_ssize_t endinpos;
3869 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003870 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003871 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003872 const char *errmsg = "";
3873 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003874 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003875 unsigned int base64bits = 0;
3876 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01003877 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003878 PyObject *errorHandler = NULL;
3879 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003880
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003881 /* Start off assuming it's all ASCII. Widen later as necessary. */
3882 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003883 if (!unicode)
3884 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003885 if (size == 0) {
3886 if (consumed)
3887 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003888 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003889 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003890
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003891 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003892 e = s + size;
3893
3894 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003895 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003896 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003897 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003898
Antoine Pitrou244651a2009-05-04 18:56:13 +00003899 if (inShift) { /* in a base-64 section */
3900 if (IS_BASE64(ch)) { /* consume a base-64 character */
3901 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3902 base64bits += 6;
3903 s++;
3904 if (base64bits >= 16) {
3905 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01003906 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00003907 base64bits -= 16;
3908 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3909 if (surrogate) {
3910 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01003911 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
3912 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003913 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3914 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003915 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003916 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003917 }
3918 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003919 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3920 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003921 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003922 }
3923 }
Victor Stinner551ac952011-11-29 22:58:13 +01003924 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003925 /* first surrogate */
3926 surrogate = outCh;
3927 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003928 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003929 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3930 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003931 }
3932 }
3933 }
3934 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003935 inShift = 0;
3936 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003937 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003938 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3939 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003940 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003941 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003942 if (base64bits > 0) { /* left-over bits */
3943 if (base64bits >= 6) {
3944 /* We've seen at least one base-64 character */
3945 errmsg = "partial character in shift sequence";
3946 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003947 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003948 else {
3949 /* Some bits remain; they should be zero */
3950 if (base64buffer != 0) {
3951 errmsg = "non-zero padding bits in shift sequence";
3952 goto utf7Error;
3953 }
3954 }
3955 }
3956 if (ch != '-') {
3957 /* '-' is absorbed; other terminating
3958 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003959 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3960 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003961 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003962 }
3963 }
3964 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003965 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003966 s++; /* consume '+' */
3967 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003968 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003969 if (unicode_putchar(&unicode, &outpos, '+') < 0)
3970 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003971 }
3972 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003973 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003974 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003975 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003976 }
3977 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003978 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003979 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3980 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003981 s++;
3982 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003983 else {
3984 startinpos = s-starts;
3985 s++;
3986 errmsg = "unexpected special character";
3987 goto utf7Error;
3988 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003989 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003990utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003991 endinpos = s-starts;
3992 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003993 errors, &errorHandler,
3994 "utf7", errmsg,
3995 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003996 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003997 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003998 }
3999
Antoine Pitrou244651a2009-05-04 18:56:13 +00004000 /* end of string */
4001
4002 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4003 /* if we're in an inconsistent state, that's an error */
4004 if (surrogate ||
4005 (base64bits >= 6) ||
4006 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004007 endinpos = size;
4008 if (unicode_decode_call_errorhandler(
4009 errors, &errorHandler,
4010 "utf7", "unterminated shift sequence",
4011 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004012 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004013 goto onError;
4014 if (s < e)
4015 goto restart;
4016 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004017 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004018
4019 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004020 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004021 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004022 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004023 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004024 }
4025 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004026 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004027 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004028 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004029
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004030 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004031 goto onError;
4032
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004033 Py_XDECREF(errorHandler);
4034 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004035 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004036
Benjamin Peterson29060642009-01-31 22:14:21 +00004037 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004038 Py_XDECREF(errorHandler);
4039 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004040 Py_DECREF(unicode);
4041 return NULL;
4042}
4043
4044
Alexander Belopolsky40018472011-02-26 01:02:56 +00004045PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004046_PyUnicode_EncodeUTF7(PyObject *str,
4047 int base64SetO,
4048 int base64WhiteSpace,
4049 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004050{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004051 int kind;
4052 void *data;
4053 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004054 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004055 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004056 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004057 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004058 unsigned int base64bits = 0;
4059 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004060 char * out;
4061 char * start;
4062
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004063 if (PyUnicode_READY(str) < 0)
4064 return NULL;
4065 kind = PyUnicode_KIND(str);
4066 data = PyUnicode_DATA(str);
4067 len = PyUnicode_GET_LENGTH(str);
4068
4069 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004070 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004071
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004072 /* It might be possible to tighten this worst case */
4073 allocated = 8 * len;
4074 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004075 return PyErr_NoMemory();
4076
Antoine Pitrou244651a2009-05-04 18:56:13 +00004077 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004078 if (v == NULL)
4079 return NULL;
4080
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004081 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004082 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004083 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004084
Antoine Pitrou244651a2009-05-04 18:56:13 +00004085 if (inShift) {
4086 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4087 /* shifting out */
4088 if (base64bits) { /* output remaining bits */
4089 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4090 base64buffer = 0;
4091 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004092 }
4093 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004094 /* Characters not in the BASE64 set implicitly unshift the sequence
4095 so no '-' is required, except if the character is itself a '-' */
4096 if (IS_BASE64(ch) || ch == '-') {
4097 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004098 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004099 *out++ = (char) ch;
4100 }
4101 else {
4102 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004103 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004104 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004105 else { /* not in a shift sequence */
4106 if (ch == '+') {
4107 *out++ = '+';
4108 *out++ = '-';
4109 }
4110 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4111 *out++ = (char) ch;
4112 }
4113 else {
4114 *out++ = '+';
4115 inShift = 1;
4116 goto encode_char;
4117 }
4118 }
4119 continue;
4120encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004121 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004122 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004123
Antoine Pitrou244651a2009-05-04 18:56:13 +00004124 /* code first surrogate */
4125 base64bits += 16;
4126 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4127 while (base64bits >= 6) {
4128 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4129 base64bits -= 6;
4130 }
4131 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004132 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004133 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004134 base64bits += 16;
4135 base64buffer = (base64buffer << 16) | ch;
4136 while (base64bits >= 6) {
4137 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4138 base64bits -= 6;
4139 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004140 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004141 if (base64bits)
4142 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4143 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004144 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004145 if (_PyBytes_Resize(&v, out - start) < 0)
4146 return NULL;
4147 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004148}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004149PyObject *
4150PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4151 Py_ssize_t size,
4152 int base64SetO,
4153 int base64WhiteSpace,
4154 const char *errors)
4155{
4156 PyObject *result;
4157 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4158 if (tmp == NULL)
4159 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004160 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004161 base64WhiteSpace, errors);
4162 Py_DECREF(tmp);
4163 return result;
4164}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004165
Antoine Pitrou244651a2009-05-04 18:56:13 +00004166#undef IS_BASE64
4167#undef FROM_BASE64
4168#undef TO_BASE64
4169#undef DECODE_DIRECT
4170#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004171
Guido van Rossumd57fd912000-03-10 22:53:23 +00004172/* --- UTF-8 Codec -------------------------------------------------------- */
4173
/* Read-only lookup table, hence const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4195
Alexander Belopolsky40018472011-02-26 01:02:56 +00004196PyObject *
4197PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004198 Py_ssize_t size,
4199 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004200{
Walter Dörwald69652032004-09-07 20:24:22 +00004201 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4202}
4203
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004204#include "stringlib/ucs1lib.h"
4205#include "stringlib/codecs.h"
4206#include "stringlib/undef.h"
4207
4208#include "stringlib/ucs2lib.h"
4209#include "stringlib/codecs.h"
4210#include "stringlib/undef.h"
4211
4212#include "stringlib/ucs4lib.h"
4213#include "stringlib/codecs.h"
4214#include "stringlib/undef.h"
4215
Antoine Pitrouab868312009-01-10 15:40:25 +00004216/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4217#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4218
4219/* Mask to quickly check whether a C 'long' contains a
4220 non-ASCII, UTF8-encoded char. */
4221#if (SIZEOF_LONG == 8)
4222# define ASCII_CHAR_MASK 0x8080808080808080L
4223#elif (SIZEOF_LONG == 4)
4224# define ASCII_CHAR_MASK 0x80808080L
4225#else
4226# error C 'long' size should be either 4 or 8!
4227#endif
4228
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004229/* Scans a UTF-8 string and returns the maximum character to be expected
4230 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004231
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004232 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004233 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004234 */
4235static Py_UCS4
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004236utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
4237 Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004239 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004240 const unsigned char *p = (const unsigned char *)s;
4241 const unsigned char *end = p + string_size;
4242 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004243
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004244 assert(unicode_size != NULL);
4245
4246 /* By having a cascade of independent loops which fallback onto each
4247 other, we minimize the amount of work done in the average loop
4248 iteration, and we also maximize the CPU's ability to predict
4249 branches correctly (because a given condition will have always the
4250 same boolean outcome except perhaps in the last iteration of the
4251 corresponding loop).
4252 In the general case this brings us rather close to decoding
4253 performance pre-PEP 393, despite the two-pass decoding.
4254
4255 Note that the pure ASCII loop is not duplicated once a non-ASCII
4256 character has been encountered. It is actually a pessimization (by
4257 a significant factor) to use this loop on text with many non-ASCII
4258 characters, and it is important to avoid bad performance on valid
4259 utf-8 data (invalid utf-8 being a different can of worms).
4260 */
4261
4262 /* ASCII */
4263 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004264 /* Only check value if it's not a ASCII char... */
4265 if (*p < 0x80) {
4266 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4267 an explanation. */
4268 if (!((size_t) p & LONG_PTR_MASK)) {
4269 /* Help register allocation */
4270 register const unsigned char *_p = p;
4271 while (_p < aligned_end) {
4272 unsigned long value = *(unsigned long *) _p;
4273 if (value & ASCII_CHAR_MASK)
4274 break;
4275 _p += SIZEOF_LONG;
4276 char_count += SIZEOF_LONG;
4277 }
4278 p = _p;
4279 if (p == end)
4280 break;
4281 }
4282 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004283 if (*p < 0x80)
4284 ++char_count;
4285 else
4286 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004287 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004288 *unicode_size = char_count;
4289 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004290
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004291_ucs1loop:
4292 for (; p < end; ++p) {
4293 if (*p < 0xc4)
4294 char_count += ((*p & 0xc0) != 0x80);
4295 else
4296 goto _ucs2loop;
4297 }
4298 *unicode_size = char_count;
4299 return 255;
4300
4301_ucs2loop:
4302 for (; p < end; ++p) {
4303 if (*p < 0xf0)
4304 char_count += ((*p & 0xc0) != 0x80);
4305 else
4306 goto _ucs4loop;
4307 }
4308 *unicode_size = char_count;
4309 return 65535;
4310
4311_ucs4loop:
4312 for (; p < end; ++p) {
4313 char_count += ((*p & 0xc0) != 0x80);
4314 }
4315 *unicode_size = char_count;
4316 return 65537;
4317}
4318
4319/* Called when we encountered some error that wasn't detected in the original
4320 scan, e.g. an encoded surrogate character. The original maxchar computation
4321 may have been incorrect, so redo it. */
4322static int
4323refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
4324{
4325 PyObject *tmp;
Victor Stinnerf8facac2011-11-22 02:30:47 +01004326 Py_ssize_t k;
4327 Py_UCS4 maxchar;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004328 for (k = 0, maxchar = 0; k < n; k++)
4329 maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
4330 tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
4331 if (tmp == NULL)
4332 return -1;
4333 PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
4334 Py_DECREF(*unicode);
4335 *unicode = tmp;
4336 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004337}
4338
/* Store `value` at position `index` of the string being decoded.

   Implicit parameters taken from the expansion site: unicode, kind, data,
   has_errors and the label onError.

   While has_errors is false this is a plain PyUnicode_WRITE through
   (kind, data).  Once has_errors is set, the string is first resized to
   index + index/8 when index lies beyond its current length, and the
   character is then stored with unicode_putchar(); either step jumps to
   onError on failure.  Because that resize overallocates, the result has to
   be shrunk to its final length when decoding is finished.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        if (!has_errors) {                                              \
            PyUnicode_WRITE(kind, data, index, value);                  \
        }                                                               \
        else {                                                          \
            Py_ssize_t pos = index;                                     \
            if (pos > PyUnicode_GET_LENGTH(unicode)) {                  \
                if (unicode_resize(&unicode, pos + pos/8) < 0)          \
                    goto onError;                                       \
            }                                                           \
            if (unicode_putchar(&unicode, &pos, value) < 0)             \
                goto onError;                                           \
        }                                                               \
    } while (0)
4357
Alexander Belopolsky40018472011-02-26 01:02:56 +00004358PyObject *
4359PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004360 Py_ssize_t size,
4361 const char *errors,
4362 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004363{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004364 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004366 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004367 Py_ssize_t startinpos;
4368 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004369 const char *e, *aligned_end;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004370 PyObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004371 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004372 PyObject *errorHandler = NULL;
4373 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004374 Py_UCS4 maxchar = 0;
4375 Py_ssize_t unicode_size;
4376 Py_ssize_t i;
4377 int kind;
4378 void *data;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004379 int has_errors = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004380
Walter Dörwald69652032004-09-07 20:24:22 +00004381 if (size == 0) {
4382 if (consumed)
4383 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004384 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004385 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004386 maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004387 /* When the string is ASCII only, just use memcpy and return.
4388 unicode_size may be != size if there is an incomplete UTF-8
4389 sequence at the end of the ASCII block. */
4390 if (maxchar < 128 && size == unicode_size) {
Victor Stinner42885202011-11-22 01:23:02 +01004391 if (consumed)
4392 *consumed = size;
4393
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004394 if (size == 1)
4395 return get_latin1_char((unsigned char)s[0]);
4396
4397 unicode = PyUnicode_New(unicode_size, maxchar);
4398 if (!unicode)
4399 return NULL;
4400 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4401 assert(_PyUnicode_CheckConsistency(unicode, 1));
4402 return unicode;
4403 }
4404
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004405 /* In case of errors, maxchar and size computation might be incorrect;
4406 code below refits and resizes as necessary. */
4407 unicode = PyUnicode_New(unicode_size, maxchar);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004408 if (!unicode)
4409 return NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004410 kind = PyUnicode_KIND(unicode);
4411 data = PyUnicode_DATA(unicode);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004412
Guido van Rossumd57fd912000-03-10 22:53:23 +00004413 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004414 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415 e = s + size;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004416 switch (kind) {
4417 case PyUnicode_1BYTE_KIND:
4418 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4419 break;
4420 case PyUnicode_2BYTE_KIND:
4421 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4422 break;
4423 case PyUnicode_4BYTE_KIND:
4424 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4425 break;
4426 }
4427 if (!has_errors) {
4428 /* Ensure the unicode size calculation was correct */
4429 assert(i == unicode_size);
4430 assert(s == e);
4431 if (consumed)
4432 *consumed = s-starts;
4433 return unicode;
4434 }
4435 /* Fall through to the generic decoding loop for the rest of
4436 the string */
4437 if (refit_partial_string(&unicode, kind, data, i) < 0)
4438 goto onError;
4439
Antoine Pitrouab868312009-01-10 15:40:25 +00004440 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441
4442 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004443 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444
4445 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004446 /* Fast path for runs of ASCII characters. Given that common UTF-8
4447 input will consist of an overwhelming majority of ASCII
4448 characters, we try to optimize for this case by checking
4449 as many characters as a C 'long' can contain.
4450 First, check if we can do an aligned read, as most CPUs have
4451 a penalty for unaligned reads.
4452 */
4453 if (!((size_t) s & LONG_PTR_MASK)) {
4454 /* Help register allocation */
4455 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004456 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004457 while (_s < aligned_end) {
4458 /* Read a whole long at a time (either 4 or 8 bytes),
4459 and do a fast unrolled copy if it only contains ASCII
4460 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004461 unsigned long value = *(unsigned long *) _s;
4462 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004463 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004464 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4465 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4466 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4467 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004468#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004469 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4470 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4471 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4472 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004473#endif
4474 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004475 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004476 }
4477 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004478 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004479 if (s == e)
4480 break;
4481 ch = (unsigned char)*s;
4482 }
4483 }
4484
4485 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004486 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004487 s++;
4488 continue;
4489 }
4490
4491 n = utf8_code_length[ch];
4492
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004493 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004494 if (consumed)
4495 break;
4496 else {
4497 errmsg = "unexpected end of data";
4498 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004499 endinpos = startinpos+1;
4500 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4501 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004502 goto utf8Error;
4503 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004504 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004505
4506 switch (n) {
4507
4508 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004509 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004510 startinpos = s-starts;
4511 endinpos = startinpos+1;
4512 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004513
4514 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004515 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004516 startinpos = s-starts;
4517 endinpos = startinpos+1;
4518 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519
4520 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004521 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004522 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004523 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004524 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004525 goto utf8Error;
4526 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004527 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004528 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004529 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004530 break;
4531
4532 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004533 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4534 will result in surrogates in range d800-dfff. Surrogates are
4535 not valid UTF-8 so they are rejected.
4536 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4537 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004538 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004539 (s[2] & 0xc0) != 0x80 ||
4540 ((unsigned char)s[0] == 0xE0 &&
4541 (unsigned char)s[1] < 0xA0) ||
4542 ((unsigned char)s[0] == 0xED &&
4543 (unsigned char)s[1] > 0x9F)) {
4544 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004545 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004546 endinpos = startinpos + 1;
4547
4548 /* if s[1] first two bits are 1 and 0, then the invalid
4549 continuation byte is s[2], so increment endinpos by 1,
4550 if not, s[1] is invalid and endinpos doesn't need to
4551 be incremented. */
4552 if ((s[1] & 0xC0) == 0x80)
4553 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004554 goto utf8Error;
4555 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004556 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004557 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004558 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004559 break;
4560
4561 case 4:
4562 if ((s[1] & 0xc0) != 0x80 ||
4563 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004564 (s[3] & 0xc0) != 0x80 ||
4565 ((unsigned char)s[0] == 0xF0 &&
4566 (unsigned char)s[1] < 0x90) ||
4567 ((unsigned char)s[0] == 0xF4 &&
4568 (unsigned char)s[1] > 0x8F)) {
4569 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004570 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004571 endinpos = startinpos + 1;
4572 if ((s[1] & 0xC0) == 0x80) {
4573 endinpos++;
4574 if ((s[2] & 0xC0) == 0x80)
4575 endinpos++;
4576 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004577 goto utf8Error;
4578 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004579 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004580 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004581 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004582
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004583 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004585 }
4586 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004588
Benjamin Peterson29060642009-01-31 22:14:21 +00004589 utf8Error:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004590 if (!has_errors) {
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004591 if (refit_partial_string(&unicode, kind, data, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004592 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004593 has_errors = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004594 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004595 if (unicode_decode_call_errorhandler(
4596 errors, &errorHandler,
4597 "utf8", errmsg,
4598 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004599 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004600 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004601 /* Update data because unicode_decode_call_errorhandler might have
4602 re-created or resized the unicode object. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004603 data = PyUnicode_DATA(unicode);
4604 kind = PyUnicode_KIND(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004605 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004606 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004607 /* Ensure the unicode_size calculation above was correct: */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004608 assert(has_errors || i == unicode_size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004609
Walter Dörwald69652032004-09-07 20:24:22 +00004610 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004611 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004613 /* Adjust length and ready string when it contained errors and
4614 is of the old resizable kind. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004615 if (has_errors) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01004616 if (PyUnicode_Resize(&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004617 goto onError;
4618 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004619
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004620 Py_XDECREF(errorHandler);
4621 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004622 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004623 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004624
Benjamin Peterson29060642009-01-31 22:14:21 +00004625 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004626 Py_XDECREF(errorHandler);
4627 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628 Py_DECREF(unicode);
4629 return NULL;
4630}
4631
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004632#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004633
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004634#ifdef __APPLE__
4635
4636/* Simplified UTF-8 decoder using surrogateescape error handler,
4637 used to decode the command line arguments on Mac OS X. */
4638
4639wchar_t*
4640_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4641{
4642 int n;
4643 const char *e;
4644 wchar_t *unicode, *p;
4645
4646 /* Note: size will always be longer than the resulting Unicode
4647 character count */
4648 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4649 PyErr_NoMemory();
4650 return NULL;
4651 }
4652 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4653 if (!unicode)
4654 return NULL;
4655
4656 /* Unpack UTF-8 encoded data */
4657 p = unicode;
4658 e = s + size;
4659 while (s < e) {
4660 Py_UCS4 ch = (unsigned char)*s;
4661
4662 if (ch < 0x80) {
4663 *p++ = (wchar_t)ch;
4664 s++;
4665 continue;
4666 }
4667
4668 n = utf8_code_length[ch];
4669 if (s + n > e) {
4670 goto surrogateescape;
4671 }
4672
4673 switch (n) {
4674 case 0:
4675 case 1:
4676 goto surrogateescape;
4677
4678 case 2:
4679 if ((s[1] & 0xc0) != 0x80)
4680 goto surrogateescape;
4681 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4682 assert ((ch > 0x007F) && (ch <= 0x07FF));
4683 *p++ = (wchar_t)ch;
4684 break;
4685
4686 case 3:
4687 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4688 will result in surrogates in range d800-dfff. Surrogates are
4689 not valid UTF-8 so they are rejected.
4690 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4691 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4692 if ((s[1] & 0xc0) != 0x80 ||
4693 (s[2] & 0xc0) != 0x80 ||
4694 ((unsigned char)s[0] == 0xE0 &&
4695 (unsigned char)s[1] < 0xA0) ||
4696 ((unsigned char)s[0] == 0xED &&
4697 (unsigned char)s[1] > 0x9F)) {
4698
4699 goto surrogateescape;
4700 }
4701 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4702 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004703 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004704 break;
4705
4706 case 4:
4707 if ((s[1] & 0xc0) != 0x80 ||
4708 (s[2] & 0xc0) != 0x80 ||
4709 (s[3] & 0xc0) != 0x80 ||
4710 ((unsigned char)s[0] == 0xF0 &&
4711 (unsigned char)s[1] < 0x90) ||
4712 ((unsigned char)s[0] == 0xF4 &&
4713 (unsigned char)s[1] > 0x8F)) {
4714 goto surrogateescape;
4715 }
4716 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4717 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004718 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004719
4720#if SIZEOF_WCHAR_T == 4
4721 *p++ = (wchar_t)ch;
4722#else
4723 /* compute and append the two surrogates: */
Victor Stinner551ac952011-11-29 22:58:13 +01004724 *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4725 *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004726#endif
4727 break;
4728 }
4729 s += n;
4730 continue;
4731
4732 surrogateescape:
4733 *p++ = 0xDC00 + ch;
4734 s++;
4735 }
4736 *p = L'\0';
4737 return unicode;
4738}
4739
4740#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004741
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004742/* Primary internal function which creates utf8 encoded bytes objects.
4743
4744 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004745 and allocate exactly as much space needed at the end. Else allocate the
4746 maximum possible needed (4 result bytes per Unicode character), and return
4747 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004748*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004749PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004750_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751{
Tim Peters602f7402002-04-27 18:03:26 +00004752#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004753
Guido van Rossum98297ee2007-11-06 21:34:58 +00004754 Py_ssize_t i; /* index into s of next input byte */
4755 PyObject *result; /* result string object */
4756 char *p; /* next free byte in output buffer */
4757 Py_ssize_t nallocated; /* number of result bytes allocated */
4758 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004759 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004760 PyObject *errorHandler = NULL;
4761 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004762 int kind;
4763 void *data;
4764 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004765 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004766
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004767 if (!PyUnicode_Check(unicode)) {
4768 PyErr_BadArgument();
4769 return NULL;
4770 }
4771
4772 if (PyUnicode_READY(unicode) == -1)
4773 return NULL;
4774
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004775 if (PyUnicode_UTF8(unicode))
4776 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4777 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004778
4779 kind = PyUnicode_KIND(unicode);
4780 data = PyUnicode_DATA(unicode);
4781 size = PyUnicode_GET_LENGTH(unicode);
4782
Tim Peters602f7402002-04-27 18:03:26 +00004783 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784
Tim Peters602f7402002-04-27 18:03:26 +00004785 if (size <= MAX_SHORT_UNICHARS) {
4786 /* Write into the stack buffer; nallocated can't overflow.
4787 * At the end, we'll allocate exactly as much heap space as it
4788 * turns out we need.
4789 */
4790 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004791 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004792 p = stackbuf;
4793 }
4794 else {
4795 /* Overallocate on the heap, and give the excess back at the end. */
4796 nallocated = size * 4;
4797 if (nallocated / 4 != size) /* overflow! */
4798 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004799 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004800 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004801 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004802 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004803 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004804
Tim Peters602f7402002-04-27 18:03:26 +00004805 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004806 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004807
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004808 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004809 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004811
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004813 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004814 *p++ = (char)(0xc0 | (ch >> 6));
4815 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner551ac952011-11-29 22:58:13 +01004816 } else if (Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004817 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004818 Py_ssize_t repsize, k, startpos;
4819 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004820 rep = unicode_encode_call_errorhandler(
4821 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004822 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004823 if (!rep)
4824 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004826 if (PyBytes_Check(rep))
4827 repsize = PyBytes_GET_SIZE(rep);
4828 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004829 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004830
4831 if (repsize > 4) {
4832 Py_ssize_t offset;
4833
4834 if (result == NULL)
4835 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004836 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004837 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004839 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4840 /* integer overflow */
4841 PyErr_NoMemory();
4842 goto error;
4843 }
4844 nallocated += repsize - 4;
4845 if (result != NULL) {
4846 if (_PyBytes_Resize(&result, nallocated) < 0)
4847 goto error;
4848 } else {
4849 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004850 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004851 goto error;
4852 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4853 }
4854 p = PyBytes_AS_STRING(result) + offset;
4855 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004857 if (PyBytes_Check(rep)) {
4858 char *prep = PyBytes_AS_STRING(rep);
4859 for(k = repsize; k > 0; k--)
4860 *p++ = *prep++;
4861 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004862 enum PyUnicode_Kind repkind;
4863 void *repdata;
4864
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004865 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004866 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004867 repkind = PyUnicode_KIND(rep);
4868 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004869
4870 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004871 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004872 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004873 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004874 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004875 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004876 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004877 goto error;
4878 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004879 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004880 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004881 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004882 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004883 } else if (ch < 0x10000) {
4884 *p++ = (char)(0xe0 | (ch >> 12));
4885 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4886 *p++ = (char)(0x80 | (ch & 0x3f));
4887 } else /* ch >= 0x10000 */ {
Victor Stinner8faf8212011-12-08 22:14:11 +01004888 assert(ch <= MAX_UNICODE);
Tim Peters602f7402002-04-27 18:03:26 +00004889 /* Encode UCS4 Unicode ordinals */
4890 *p++ = (char)(0xf0 | (ch >> 18));
4891 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4892 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4893 *p++ = (char)(0x80 | (ch & 0x3f));
4894 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004896
Guido van Rossum98297ee2007-11-06 21:34:58 +00004897 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004898 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004899 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004900 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004901 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004902 }
4903 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004904 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004905 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004906 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004907 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004908 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004909
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004910 Py_XDECREF(errorHandler);
4911 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004912 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004913 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004914 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004915 Py_XDECREF(errorHandler);
4916 Py_XDECREF(exc);
4917 Py_XDECREF(result);
4918 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004919
Tim Peters602f7402002-04-27 18:03:26 +00004920#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004921}
4922
Alexander Belopolsky40018472011-02-26 01:02:56 +00004923PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004924PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4925 Py_ssize_t size,
4926 const char *errors)
4927{
4928 PyObject *v, *unicode;
4929
4930 unicode = PyUnicode_FromUnicode(s, size);
4931 if (unicode == NULL)
4932 return NULL;
4933 v = _PyUnicode_AsUTF8String(unicode, errors);
4934 Py_DECREF(unicode);
4935 return v;
4936}
4937
4938PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004939PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004941 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004942}
4943
Walter Dörwald41980ca2007-08-16 21:55:45 +00004944/* --- UTF-32 Codec ------------------------------------------------------- */
4945
4946PyObject *
4947PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004948 Py_ssize_t size,
4949 const char *errors,
4950 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004951{
4952 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4953}
4954
4955PyObject *
4956PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004957 Py_ssize_t size,
4958 const char *errors,
4959 int *byteorder,
4960 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004961{
4962 const char *starts = s;
4963 Py_ssize_t startinpos;
4964 Py_ssize_t endinpos;
4965 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004966 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004967 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004968 int bo = 0; /* assume native ordering by default */
4969 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004970 /* Offsets from q for retrieving bytes in the right order. */
4971#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4972 int iorder[] = {0, 1, 2, 3};
4973#else
4974 int iorder[] = {3, 2, 1, 0};
4975#endif
4976 PyObject *errorHandler = NULL;
4977 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004978
Walter Dörwald41980ca2007-08-16 21:55:45 +00004979 q = (unsigned char *)s;
4980 e = q + size;
4981
4982 if (byteorder)
4983 bo = *byteorder;
4984
4985 /* Check for BOM marks (U+FEFF) in the input and adjust current
4986 byte order setting accordingly. In native mode, the leading BOM
4987 mark is skipped, in all other modes, it is copied to the output
4988 stream as-is (giving a ZWNBSP character). */
4989 if (bo == 0) {
4990 if (size >= 4) {
4991 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004992 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004993#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004994 if (bom == 0x0000FEFF) {
4995 q += 4;
4996 bo = -1;
4997 }
4998 else if (bom == 0xFFFE0000) {
4999 q += 4;
5000 bo = 1;
5001 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005002#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005003 if (bom == 0x0000FEFF) {
5004 q += 4;
5005 bo = 1;
5006 }
5007 else if (bom == 0xFFFE0000) {
5008 q += 4;
5009 bo = -1;
5010 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005011#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005012 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005013 }
5014
5015 if (bo == -1) {
5016 /* force LE */
5017 iorder[0] = 0;
5018 iorder[1] = 1;
5019 iorder[2] = 2;
5020 iorder[3] = 3;
5021 }
5022 else if (bo == 1) {
5023 /* force BE */
5024 iorder[0] = 3;
5025 iorder[1] = 2;
5026 iorder[2] = 1;
5027 iorder[3] = 0;
5028 }
5029
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005030 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005031 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005032 if (!unicode)
5033 return NULL;
5034 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005035 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005036 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005037
Walter Dörwald41980ca2007-08-16 21:55:45 +00005038 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005039 Py_UCS4 ch;
5040 /* remaining bytes at the end? (size should be divisible by 4) */
5041 if (e-q<4) {
5042 if (consumed)
5043 break;
5044 errmsg = "truncated data";
5045 startinpos = ((const char *)q)-starts;
5046 endinpos = ((const char *)e)-starts;
5047 goto utf32Error;
5048 /* The remaining input chars are ignored if the callback
5049 chooses to skip the input */
5050 }
5051 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5052 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005053
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 if (ch >= 0x110000)
5055 {
5056 errmsg = "codepoint not in range(0x110000)";
5057 startinpos = ((const char *)q)-starts;
5058 endinpos = startinpos+4;
5059 goto utf32Error;
5060 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005061 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5062 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 q += 4;
5064 continue;
5065 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 if (unicode_decode_call_errorhandler(
5067 errors, &errorHandler,
5068 "utf32", errmsg,
5069 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005070 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005071 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005072 }
5073
5074 if (byteorder)
5075 *byteorder = bo;
5076
5077 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005078 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005079
5080 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005081 if (PyUnicode_Resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005082 goto onError;
5083
5084 Py_XDECREF(errorHandler);
5085 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005086 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005087
Benjamin Peterson29060642009-01-31 22:14:21 +00005088 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005089 Py_DECREF(unicode);
5090 Py_XDECREF(errorHandler);
5091 Py_XDECREF(exc);
5092 return NULL;
5093}
5094
5095PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005096_PyUnicode_EncodeUTF32(PyObject *str,
5097 const char *errors,
5098 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005099{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005100 int kind;
5101 void *data;
5102 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005103 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005104 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005105 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005106 /* Offsets from p for storing byte pairs in the right order. */
5107#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5108 int iorder[] = {0, 1, 2, 3};
5109#else
5110 int iorder[] = {3, 2, 1, 0};
5111#endif
5112
Benjamin Peterson29060642009-01-31 22:14:21 +00005113#define STORECHAR(CH) \
5114 do { \
5115 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5116 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5117 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5118 p[iorder[0]] = (CH) & 0xff; \
5119 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005120 } while(0)
5121
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005122 if (!PyUnicode_Check(str)) {
5123 PyErr_BadArgument();
5124 return NULL;
5125 }
5126 if (PyUnicode_READY(str) < 0)
5127 return NULL;
5128 kind = PyUnicode_KIND(str);
5129 data = PyUnicode_DATA(str);
5130 len = PyUnicode_GET_LENGTH(str);
5131
5132 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005133 bytesize = nsize * 4;
5134 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005135 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005136 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005137 if (v == NULL)
5138 return NULL;
5139
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005140 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005141 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005142 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005143 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005144 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005145
5146 if (byteorder == -1) {
5147 /* force LE */
5148 iorder[0] = 0;
5149 iorder[1] = 1;
5150 iorder[2] = 2;
5151 iorder[3] = 3;
5152 }
5153 else if (byteorder == 1) {
5154 /* force BE */
5155 iorder[0] = 3;
5156 iorder[1] = 2;
5157 iorder[2] = 1;
5158 iorder[3] = 0;
5159 }
5160
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005161 for (i = 0; i < len; i++)
5162 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005163
5164 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005165 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005166#undef STORECHAR
5167}
5168
Alexander Belopolsky40018472011-02-26 01:02:56 +00005169PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005170PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5171 Py_ssize_t size,
5172 const char *errors,
5173 int byteorder)
5174{
5175 PyObject *result;
5176 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5177 if (tmp == NULL)
5178 return NULL;
5179 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5180 Py_DECREF(tmp);
5181 return result;
5182}
5183
5184PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005185PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005186{
Victor Stinnerb960b342011-11-20 19:12:52 +01005187 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005188}
5189
/* --- UTF-16 Codec ------------------------------------------------------- */
5191
Tim Peters772747b2001-08-09 22:21:55 +00005192PyObject *
5193PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 Py_ssize_t size,
5195 const char *errors,
5196 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197{
Walter Dörwald69652032004-09-07 20:24:22 +00005198 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5199}
5200
Antoine Pitrouab868312009-01-10 15:40:25 +00005201/* Two masks for fast checking of whether a C 'long' may contain
5202 UTF16-encoded surrogate characters. This is an efficient heuristic,
5203 assuming that non-surrogate characters with a code point >= 0x8000 are
5204 rare in most input.
5205 FAST_CHAR_MASK is used when the input is in native byte ordering,
5206 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005207*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005208#if (SIZEOF_LONG == 8)
5209# define FAST_CHAR_MASK 0x8000800080008000L
5210# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5211#elif (SIZEOF_LONG == 4)
5212# define FAST_CHAR_MASK 0x80008000L
5213# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5214#else
5215# error C 'long' size should be either 4 or 8!
5216#endif
5217
Walter Dörwald69652032004-09-07 20:24:22 +00005218PyObject *
5219PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005220 Py_ssize_t size,
5221 const char *errors,
5222 int *byteorder,
5223 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005224{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005225 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005226 Py_ssize_t startinpos;
5227 Py_ssize_t endinpos;
5228 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005229 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005230 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005231 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005232 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005233 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005234 /* Offsets from q for retrieving byte pairs in the right order. */
5235#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5236 int ihi = 1, ilo = 0;
5237#else
5238 int ihi = 0, ilo = 1;
5239#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005240 PyObject *errorHandler = NULL;
5241 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242
5243 /* Note: size will always be longer than the resulting Unicode
5244 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005245 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246 if (!unicode)
5247 return NULL;
5248 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005249 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005250 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251
Tim Peters772747b2001-08-09 22:21:55 +00005252 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005253 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254
5255 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005256 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005258 /* Check for BOM marks (U+FEFF) in the input and adjust current
5259 byte order setting accordingly. In native mode, the leading BOM
5260 mark is skipped, in all other modes, it is copied to the output
5261 stream as-is (giving a ZWNBSP character). */
5262 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005263 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005264 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005265#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005266 if (bom == 0xFEFF) {
5267 q += 2;
5268 bo = -1;
5269 }
5270 else if (bom == 0xFFFE) {
5271 q += 2;
5272 bo = 1;
5273 }
Tim Petersced69f82003-09-16 20:30:58 +00005274#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005275 if (bom == 0xFEFF) {
5276 q += 2;
5277 bo = 1;
5278 }
5279 else if (bom == 0xFFFE) {
5280 q += 2;
5281 bo = -1;
5282 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005283#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005284 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005285 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286
Tim Peters772747b2001-08-09 22:21:55 +00005287 if (bo == -1) {
5288 /* force LE */
5289 ihi = 1;
5290 ilo = 0;
5291 }
5292 else if (bo == 1) {
5293 /* force BE */
5294 ihi = 0;
5295 ilo = 1;
5296 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005297#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5298 native_ordering = ilo < ihi;
5299#else
5300 native_ordering = ilo > ihi;
5301#endif
Tim Peters772747b2001-08-09 22:21:55 +00005302
Antoine Pitrouab868312009-01-10 15:40:25 +00005303 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005304 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005305 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005306 /* First check for possible aligned read of a C 'long'. Unaligned
5307 reads are more expensive, better to defer to another iteration. */
5308 if (!((size_t) q & LONG_PTR_MASK)) {
5309 /* Fast path for runs of non-surrogate chars. */
5310 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005311 int kind = PyUnicode_KIND(unicode);
5312 void *data = PyUnicode_DATA(unicode);
5313 while (_q < aligned_end) {
5314 unsigned long block = * (unsigned long *) _q;
5315 unsigned short *pblock = (unsigned short*)&block;
5316 Py_UCS4 maxch;
5317 if (native_ordering) {
5318 /* Can use buffer directly */
5319 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005320 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005321 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005322 else {
5323 /* Need to byte-swap */
5324 unsigned char *_p = (unsigned char*)pblock;
5325 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005326 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005327 _p[0] = _q[1];
5328 _p[1] = _q[0];
5329 _p[2] = _q[3];
5330 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005331#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005332 _p[4] = _q[5];
5333 _p[5] = _q[4];
5334 _p[6] = _q[7];
5335 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005336#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005337 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005338 maxch = Py_MAX(pblock[0], pblock[1]);
5339#if SIZEOF_LONG == 8
5340 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5341#endif
5342 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5343 if (unicode_widen(&unicode, maxch) < 0)
5344 goto onError;
5345 kind = PyUnicode_KIND(unicode);
5346 data = PyUnicode_DATA(unicode);
5347 }
5348 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5349 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5350#if SIZEOF_LONG == 8
5351 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5352 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5353#endif
5354 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005355 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005356 q = _q;
5357 if (q >= e)
5358 break;
5359 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005361
Benjamin Peterson14339b62009-01-31 16:36:08 +00005362 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005363
Victor Stinner551ac952011-11-29 22:58:13 +01005364 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005365 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5366 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 continue;
5368 }
5369
5370 /* UTF-16 code pair: */
5371 if (q > e) {
5372 errmsg = "unexpected end of data";
5373 startinpos = (((const char *)q) - 2) - starts;
5374 endinpos = ((const char *)e) + 1 - starts;
5375 goto utf16Error;
5376 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005377 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5378 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005380 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005381 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005382 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005383 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005384 continue;
5385 }
5386 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005387 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005388 startinpos = (((const char *)q)-4)-starts;
5389 endinpos = startinpos+2;
5390 goto utf16Error;
5391 }
5392
Benjamin Peterson14339b62009-01-31 16:36:08 +00005393 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005394 errmsg = "illegal encoding";
5395 startinpos = (((const char *)q)-2)-starts;
5396 endinpos = startinpos+2;
5397 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005398
Benjamin Peterson29060642009-01-31 22:14:21 +00005399 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005400 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005401 errors,
5402 &errorHandler,
5403 "utf16", errmsg,
5404 &starts,
5405 (const char **)&e,
5406 &startinpos,
5407 &endinpos,
5408 &exc,
5409 (const char **)&q,
5410 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005411 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005414 /* remaining byte at the end? (size should be even) */
5415 if (e == q) {
5416 if (!consumed) {
5417 errmsg = "truncated data";
5418 startinpos = ((const char *)q) - starts;
5419 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005420 if (unicode_decode_call_errorhandler(
5421 errors,
5422 &errorHandler,
5423 "utf16", errmsg,
5424 &starts,
5425 (const char **)&e,
5426 &startinpos,
5427 &endinpos,
5428 &exc,
5429 (const char **)&q,
5430 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005431 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005432 goto onError;
5433 /* The remaining input chars are ignored if the callback
5434 chooses to skip the input */
5435 }
5436 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437
5438 if (byteorder)
5439 *byteorder = bo;
5440
Walter Dörwald69652032004-09-07 20:24:22 +00005441 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005442 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005443
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005445 if (PyUnicode_Resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 goto onError;
5447
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005448 Py_XDECREF(errorHandler);
5449 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005450 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451
Benjamin Peterson29060642009-01-31 22:14:21 +00005452 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005454 Py_XDECREF(errorHandler);
5455 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456 return NULL;
5457}
5458
Antoine Pitrouab868312009-01-10 15:40:25 +00005459#undef FAST_CHAR_MASK
5460#undef SWAPPED_FAST_CHAR_MASK
5461
Tim Peters772747b2001-08-09 22:21:55 +00005462PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005463_PyUnicode_EncodeUTF16(PyObject *str,
5464 const char *errors,
5465 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005467 int kind;
5468 void *data;
5469 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005470 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005471 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005472 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005473 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005474 /* Offsets from p for storing byte pairs in the right order. */
5475#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5476 int ihi = 1, ilo = 0;
5477#else
5478 int ihi = 0, ilo = 1;
5479#endif
5480
Benjamin Peterson29060642009-01-31 22:14:21 +00005481#define STORECHAR(CH) \
5482 do { \
5483 p[ihi] = ((CH) >> 8) & 0xff; \
5484 p[ilo] = (CH) & 0xff; \
5485 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005486 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005488 if (!PyUnicode_Check(str)) {
5489 PyErr_BadArgument();
5490 return NULL;
5491 }
5492 if (PyUnicode_READY(str) < 0)
5493 return NULL;
5494 kind = PyUnicode_KIND(str);
5495 data = PyUnicode_DATA(str);
5496 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005497
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005498 pairs = 0;
5499 if (kind == PyUnicode_4BYTE_KIND)
5500 for (i = 0; i < len; i++)
5501 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5502 pairs++;
5503 /* 2 * (len + pairs + (byteorder == 0)) */
5504 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005505 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005506 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005507 bytesize = nsize * 2;
5508 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005509 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005510 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511 if (v == NULL)
5512 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005514 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005516 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005517 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005518 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005519
5520 if (byteorder == -1) {
5521 /* force LE */
5522 ihi = 1;
5523 ilo = 0;
5524 }
5525 else if (byteorder == 1) {
5526 /* force BE */
5527 ihi = 0;
5528 ilo = 1;
5529 }
5530
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005531 for (i = 0; i < len; i++) {
5532 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5533 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005534 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005535 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5536 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005537 }
Tim Peters772747b2001-08-09 22:21:55 +00005538 STORECHAR(ch);
5539 if (ch2)
5540 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005541 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005542
5543 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005544 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005545#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546}
5547
Alexander Belopolsky40018472011-02-26 01:02:56 +00005548PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005549PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5550 Py_ssize_t size,
5551 const char *errors,
5552 int byteorder)
5553{
5554 PyObject *result;
5555 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5556 if (tmp == NULL)
5557 return NULL;
5558 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5559 Py_DECREF(tmp);
5560 return result;
5561}
5562
5563PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005564PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005566 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567}
5568
/* --- Unicode Escape Codec ----------------------------------------------- */
5570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005571/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5572 if all the escapes in the string make it still a valid ASCII string.
5573 Returns -1 if any escapes were found which cause the string to
5574 pop out of ASCII range. Otherwise returns the length of the
5575 required buffer to hold the string.
5576 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005577static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005578length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5579{
5580 const unsigned char *p = (const unsigned char *)s;
5581 const unsigned char *end = p + size;
5582 Py_ssize_t length = 0;
5583
5584 if (size < 0)
5585 return -1;
5586
5587 for (; p < end; ++p) {
5588 if (*p > 127) {
5589 /* Non-ASCII */
5590 return -1;
5591 }
5592 else if (*p != '\\') {
5593 /* Normal character */
5594 ++length;
5595 }
5596 else {
5597 /* Backslash-escape, check next char */
5598 ++p;
5599 /* Escape sequence reaches till end of string or
5600 non-ASCII follow-up. */
5601 if (p >= end || *p > 127)
5602 return -1;
5603 switch (*p) {
5604 case '\n':
5605 /* backslash + \n result in zero characters */
5606 break;
5607 case '\\': case '\'': case '\"':
5608 case 'b': case 'f': case 't':
5609 case 'n': case 'r': case 'v': case 'a':
5610 ++length;
5611 break;
5612 case '0': case '1': case '2': case '3':
5613 case '4': case '5': case '6': case '7':
5614 case 'x': case 'u': case 'U': case 'N':
5615 /* these do not guarantee ASCII characters */
5616 return -1;
5617 default:
5618 /* count the backslash + the other character */
5619 length += 2;
5620 }
5621 }
5622 }
5623 return length;
5624}
5625
Fredrik Lundh06d12682001-01-24 07:59:11 +00005626static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005627
Alexander Belopolsky40018472011-02-26 01:02:56 +00005628PyObject *
5629PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005630 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005631 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005633 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005634 Py_ssize_t startinpos;
5635 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005636 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005637 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005639 char* message;
5640 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005641 PyObject *errorHandler = NULL;
5642 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005643 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005644 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005645
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005646 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005647
5648 /* After length_of_escaped_ascii_string() there are two alternatives,
5649 either the string is pure ASCII with named escapes like \n, etc.
5650 and we determined it's exact size (common case)
5651 or it contains \x, \u, ... escape sequences. then we create a
5652 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005653 if (len >= 0) {
5654 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005655 if (!v)
5656 goto onError;
5657 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005658 }
5659 else {
5660 /* Escaped strings will always be longer than the resulting
5661 Unicode string, so we start with size here and then reduce the
5662 length after conversion to the true value.
5663 (but if the error callback returns a long replacement string
5664 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005665 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005666 if (!v)
5667 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005668 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005669 }
5670
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005672 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005673 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005675
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 while (s < end) {
5677 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005678 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005679 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005681 /* The only case in which i == ascii_length is a backslash
5682 followed by a newline. */
5683 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005684
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685 /* Non-escape characters are interpreted as Unicode ordinals */
5686 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005687 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5688 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 continue;
5690 }
5691
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005692 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 /* \ - Escapes */
5694 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005695 c = *s++;
5696 if (s > end)
5697 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005698
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005699 /* The only case in which i == ascii_length is a backslash
5700 followed by a newline. */
5701 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005702
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005703 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704
Benjamin Peterson29060642009-01-31 22:14:21 +00005705 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005706#define WRITECHAR(ch) \
5707 do { \
5708 if (unicode_putchar(&v, &i, ch) < 0) \
5709 goto onError; \
5710 }while(0)
5711
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005713 case '\\': WRITECHAR('\\'); break;
5714 case '\'': WRITECHAR('\''); break;
5715 case '\"': WRITECHAR('\"'); break;
5716 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005717 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005718 case 'f': WRITECHAR('\014'); break;
5719 case 't': WRITECHAR('\t'); break;
5720 case 'n': WRITECHAR('\n'); break;
5721 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005722 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005723 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005724 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005725 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726
Benjamin Peterson29060642009-01-31 22:14:21 +00005727 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 case '0': case '1': case '2': case '3':
5729 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005730 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005731 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005732 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005733 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005734 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005736 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 break;
5738
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 /* hex escapes */
5740 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005742 digits = 2;
5743 message = "truncated \\xXX escape";
5744 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005748 digits = 4;
5749 message = "truncated \\uXXXX escape";
5750 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751
Benjamin Peterson29060642009-01-31 22:14:21 +00005752 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005753 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005754 digits = 8;
5755 message = "truncated \\UXXXXXXXX escape";
5756 hexescape:
5757 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005758 if (s+digits>end) {
5759 endinpos = size;
5760 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005761 errors, &errorHandler,
5762 "unicodeescape", "end of string in escape sequence",
5763 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005764 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005765 goto onError;
5766 goto nextByte;
5767 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005768 for (j = 0; j < digits; ++j) {
5769 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005770 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005771 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005772 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 errors, &errorHandler,
5774 "unicodeescape", message,
5775 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005776 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005777 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005778 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005779 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005780 }
5781 chr = (chr<<4) & ~0xF;
5782 if (c >= '0' && c <= '9')
5783 chr += c - '0';
5784 else if (c >= 'a' && c <= 'f')
5785 chr += 10 + c - 'a';
5786 else
5787 chr += 10 + c - 'A';
5788 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005789 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005790 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005791 /* _decoding_error will have already written into the
5792 target buffer. */
5793 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005794 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005795 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005796 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005797 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005798 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005799 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005800 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005801 errors, &errorHandler,
5802 "unicodeescape", "illegal Unicode character",
5803 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005804 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005805 goto onError;
5806 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005807 break;
5808
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005810 case 'N':
5811 message = "malformed \\N character escape";
5812 if (ucnhash_CAPI == NULL) {
5813 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005814 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5815 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005816 if (ucnhash_CAPI == NULL)
5817 goto ucnhashError;
5818 }
5819 if (*s == '{') {
5820 const char *start = s+1;
5821 /* look for the closing brace */
5822 while (*s != '}' && s < end)
5823 s++;
5824 if (s > start && s < end && *s == '}') {
5825 /* found a name. look it up in the unicode database */
5826 message = "unknown Unicode character name";
5827 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005828 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005829 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005830 goto store;
5831 }
5832 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005833 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005834 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005835 errors, &errorHandler,
5836 "unicodeescape", message,
5837 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005838 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005839 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005840 break;
5841
5842 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005843 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005844 message = "\\ at end of string";
5845 s--;
5846 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005847 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005848 errors, &errorHandler,
5849 "unicodeescape", message,
5850 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005851 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005852 goto onError;
5853 }
5854 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005855 WRITECHAR('\\');
5856 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005857 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005858 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005860 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005861 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005863#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005864
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005865 if (PyUnicode_Resize(&v, i) < 0)
5866 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005867 Py_XDECREF(errorHandler);
5868 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005869 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005870
Benjamin Peterson29060642009-01-31 22:14:21 +00005871 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005872 PyErr_SetString(
5873 PyExc_UnicodeError,
5874 "\\N escapes not supported (can't load unicodedata module)"
5875 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005876 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005877 Py_XDECREF(errorHandler);
5878 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005879 return NULL;
5880
Benjamin Peterson29060642009-01-31 22:14:21 +00005881 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005883 Py_XDECREF(errorHandler);
5884 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885 return NULL;
5886}
5887
5888/* Return a Unicode-Escape string version of the Unicode object.
5889
5890 If quotes is true, the string is enclosed in u"" or u'' quotes as
5891 appropriate.
5892
5893*/
5894
Alexander Belopolsky40018472011-02-26 01:02:56 +00005895PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005896PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005898 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005899 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005901 int kind;
5902 void *data;
5903 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904
Thomas Wouters89f507f2006-12-13 04:49:30 +00005905 /* Initial allocation is based on the longest-possible unichr
5906 escape.
5907
5908 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5909 unichr, so in this case it's the longest unichr escape. In
5910 narrow (UTF-16) builds this is five chars per source unichr
5911 since there are two unichrs in the surrogate pair, so in narrow
5912 (UTF-16) builds it's not the longest unichr escape.
5913
5914 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5915 so in the narrow (UTF-16) build case it's the longest unichr
5916 escape.
5917 */
5918
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005919 if (!PyUnicode_Check(unicode)) {
5920 PyErr_BadArgument();
5921 return NULL;
5922 }
5923 if (PyUnicode_READY(unicode) < 0)
5924 return NULL;
5925 len = PyUnicode_GET_LENGTH(unicode);
5926 kind = PyUnicode_KIND(unicode);
5927 data = PyUnicode_DATA(unicode);
5928 switch(kind) {
5929 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5930 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5931 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5932 }
5933
5934 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005935 return PyBytes_FromStringAndSize(NULL, 0);
5936
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005937 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005938 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005939
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005940 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005941 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005942 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005943 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 if (repr == NULL)
5945 return NULL;
5946
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005947 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005949 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005950 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005951
Walter Dörwald79e913e2007-05-12 11:08:06 +00005952 /* Escape backslashes */
5953 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 *p++ = '\\';
5955 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005956 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005957 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005958
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005959 /* Map 21-bit characters to '\U00xxxxxx' */
5960 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005961 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005962 *p++ = '\\';
5963 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005964 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5965 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5966 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5967 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5968 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5969 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5970 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5971 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005973 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005974
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005976 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 *p++ = '\\';
5978 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005979 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5980 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5981 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5982 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005984
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005985 /* Map special whitespace to '\t', \n', '\r' */
5986 else if (ch == '\t') {
5987 *p++ = '\\';
5988 *p++ = 't';
5989 }
5990 else if (ch == '\n') {
5991 *p++ = '\\';
5992 *p++ = 'n';
5993 }
5994 else if (ch == '\r') {
5995 *p++ = '\\';
5996 *p++ = 'r';
5997 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005998
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005999 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006000 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006002 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006003 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6004 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006005 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006006
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007 /* Copy everything else as-is */
6008 else
6009 *p++ = (char) ch;
6010 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006012 assert(p - PyBytes_AS_STRING(repr) > 0);
6013 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6014 return NULL;
6015 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016}
6017
Alexander Belopolsky40018472011-02-26 01:02:56 +00006018PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006019PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6020 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006022 PyObject *result;
6023 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6024 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006026 result = PyUnicode_AsUnicodeEscapeString(tmp);
6027 Py_DECREF(tmp);
6028 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029}
6030
6031/* --- Raw Unicode Escape Codec ------------------------------------------- */
6032
Alexander Belopolsky40018472011-02-26 01:02:56 +00006033PyObject *
6034PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006035 Py_ssize_t size,
6036 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006038 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006039 Py_ssize_t startinpos;
6040 Py_ssize_t endinpos;
6041 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006042 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043 const char *end;
6044 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006045 PyObject *errorHandler = NULL;
6046 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006047
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048 /* Escaped strings will always be longer than the resulting
6049 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006050 length after conversion to the true value. (But decoding error
6051 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006052 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006056 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006057 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058 end = s + size;
6059 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 unsigned char c;
6061 Py_UCS4 x;
6062 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006063 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 /* Non-escape characters are interpreted as Unicode ordinals */
6066 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006067 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6068 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006070 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 startinpos = s-starts;
6072
6073 /* \u-escapes are only interpreted iff the number of leading
6074 backslashes if odd */
6075 bs = s;
6076 for (;s < end;) {
6077 if (*s != '\\')
6078 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006079 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6080 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 }
6082 if (((s - bs) & 1) == 0 ||
6083 s >= end ||
6084 (*s != 'u' && *s != 'U')) {
6085 continue;
6086 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006087 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 count = *s=='u' ? 4 : 8;
6089 s++;
6090
6091 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006092 for (x = 0, i = 0; i < count; ++i, ++s) {
6093 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006094 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006095 endinpos = s-starts;
6096 if (unicode_decode_call_errorhandler(
6097 errors, &errorHandler,
6098 "rawunicodeescape", "truncated \\uXXXX",
6099 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006100 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 goto onError;
6102 goto nextByte;
6103 }
6104 x = (x<<4) & ~0xF;
6105 if (c >= '0' && c <= '9')
6106 x += c - '0';
6107 else if (c >= 'a' && c <= 'f')
6108 x += 10 + c - 'a';
6109 else
6110 x += 10 + c - 'A';
6111 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006112 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006113 if (unicode_putchar(&v, &outpos, x) < 0)
6114 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006115 } else {
6116 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006117 if (unicode_decode_call_errorhandler(
6118 errors, &errorHandler,
6119 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006120 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006121 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006122 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006123 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 nextByte:
6125 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006127 if (PyUnicode_Resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006129 Py_XDECREF(errorHandler);
6130 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006131 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006132
Benjamin Peterson29060642009-01-31 22:14:21 +00006133 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006135 Py_XDECREF(errorHandler);
6136 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137 return NULL;
6138}
6139
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006140
Alexander Belopolsky40018472011-02-26 01:02:56 +00006141PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006142PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006144 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 char *p;
6146 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006147 Py_ssize_t expandsize, pos;
6148 int kind;
6149 void *data;
6150 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152 if (!PyUnicode_Check(unicode)) {
6153 PyErr_BadArgument();
6154 return NULL;
6155 }
6156 if (PyUnicode_READY(unicode) < 0)
6157 return NULL;
6158 kind = PyUnicode_KIND(unicode);
6159 data = PyUnicode_DATA(unicode);
6160 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006161 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6162 bytes, and 1 byte characters 4. */
6163 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006164
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006165 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006166 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006167
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006168 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 if (repr == NULL)
6170 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006171 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006172 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006174 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006175 for (pos = 0; pos < len; pos++) {
6176 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006177 /* Map 32-bit characters to '\Uxxxxxxxx' */
6178 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006179 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006180 *p++ = '\\';
6181 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006182 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6183 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6184 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6185 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6186 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6187 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6188 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6189 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006190 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006192 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193 *p++ = '\\';
6194 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006195 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6196 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6197 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6198 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006200 /* Copy everything else as-is */
6201 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202 *p++ = (char) ch;
6203 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006204
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006205 assert(p > q);
6206 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006207 return NULL;
6208 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209}
6210
Alexander Belopolsky40018472011-02-26 01:02:56 +00006211PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006212PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6213 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006215 PyObject *result;
6216 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6217 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006218 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006219 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6220 Py_DECREF(tmp);
6221 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222}
6223
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006224/* --- Unicode Internal Codec ------------------------------------------- */
6225
Alexander Belopolsky40018472011-02-26 01:02:56 +00006226PyObject *
6227_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006228 Py_ssize_t size,
6229 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006230{
6231 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006232 Py_ssize_t startinpos;
6233 Py_ssize_t endinpos;
6234 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006235 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006236 const char *end;
6237 const char *reason;
6238 PyObject *errorHandler = NULL;
6239 PyObject *exc = NULL;
6240
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006241 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006242 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006243 1))
6244 return NULL;
6245
Thomas Wouters89f507f2006-12-13 04:49:30 +00006246 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006247 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006248 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006250 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006251 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006252 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006253 end = s + size;
6254
6255 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006256 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006257 Py_UCS4 ch;
6258 /* We copy the raw representation one byte at a time because the
6259 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006260 ((char *) &uch)[0] = s[0];
6261 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006262#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006263 ((char *) &uch)[2] = s[2];
6264 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006265#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006266 ch = uch;
6267
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006268 /* We have to sanity check the raw data, otherwise doom looms for
6269 some malformed UCS-4 data. */
6270 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006271#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006272 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006273#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006274 end-s < Py_UNICODE_SIZE
6275 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006277 startinpos = s - starts;
6278 if (end-s < Py_UNICODE_SIZE) {
6279 endinpos = end-starts;
6280 reason = "truncated input";
6281 }
6282 else {
6283 endinpos = s - starts + Py_UNICODE_SIZE;
6284 reason = "illegal code point (> 0x10FFFF)";
6285 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006286 if (unicode_decode_call_errorhandler(
6287 errors, &errorHandler,
6288 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006289 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006290 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006291 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006292 continue;
6293 }
6294
6295 s += Py_UNICODE_SIZE;
6296#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006297 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006298 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006299 Py_UNICODE uch2;
6300 ((char *) &uch2)[0] = s[0];
6301 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006302 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006303 {
Victor Stinner551ac952011-11-29 22:58:13 +01006304 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006305 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006306 }
6307 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006308#endif
6309
6310 if (unicode_putchar(&v, &outpos, ch) < 0)
6311 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006312 }
6313
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006314 if (PyUnicode_Resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006315 goto onError;
6316 Py_XDECREF(errorHandler);
6317 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006318 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006319
Benjamin Peterson29060642009-01-31 22:14:21 +00006320 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006321 Py_XDECREF(v);
6322 Py_XDECREF(errorHandler);
6323 Py_XDECREF(exc);
6324 return NULL;
6325}
6326
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327/* --- Latin-1 Codec ------------------------------------------------------ */
6328
Alexander Belopolsky40018472011-02-26 01:02:56 +00006329PyObject *
6330PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006331 Py_ssize_t size,
6332 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006333{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006334 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006335 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336}
6337
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006338/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006339static void
6340make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006341 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006342 PyObject *unicode,
6343 Py_ssize_t startpos, Py_ssize_t endpos,
6344 const char *reason)
6345{
6346 if (*exceptionObject == NULL) {
6347 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006348 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006349 encoding, unicode, startpos, endpos, reason);
6350 }
6351 else {
6352 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6353 goto onError;
6354 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6355 goto onError;
6356 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6357 goto onError;
6358 return;
6359 onError:
6360 Py_DECREF(*exceptionObject);
6361 *exceptionObject = NULL;
6362 }
6363}
6364
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006365/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006366static void
6367raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006368 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006369 PyObject *unicode,
6370 Py_ssize_t startpos, Py_ssize_t endpos,
6371 const char *reason)
6372{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006373 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006374 encoding, unicode, startpos, endpos, reason);
6375 if (*exceptionObject != NULL)
6376 PyCodec_StrictErrors(*exceptionObject);
6377}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006378
6379/* error handling callback helper:
6380 build arguments, call the callback and check the arguments,
6381 put the result into newpos and return the replacement string, which
6382 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006383static PyObject *
6384unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006385 PyObject **errorHandler,
6386 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006387 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006388 Py_ssize_t startpos, Py_ssize_t endpos,
6389 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006390{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006391 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006392 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006393 PyObject *restuple;
6394 PyObject *resunicode;
6395
6396 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006398 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006399 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006400 }
6401
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006402 if (PyUnicode_READY(unicode) < 0)
6403 return NULL;
6404 len = PyUnicode_GET_LENGTH(unicode);
6405
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006406 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006407 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006408 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006409 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006410
6411 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006413 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006414 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006415 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006416 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 Py_DECREF(restuple);
6418 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006420 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006421 &resunicode, newpos)) {
6422 Py_DECREF(restuple);
6423 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006424 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006425 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6426 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6427 Py_DECREF(restuple);
6428 return NULL;
6429 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006431 *newpos = len + *newpos;
6432 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006433 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6434 Py_DECREF(restuple);
6435 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006436 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006437 Py_INCREF(resunicode);
6438 Py_DECREF(restuple);
6439 return resunicode;
6440}
6441
Alexander Belopolsky40018472011-02-26 01:02:56 +00006442static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006443unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006444 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006445 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006446{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006447 /* input state */
6448 Py_ssize_t pos=0, size;
6449 int kind;
6450 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006451 /* output object */
6452 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006453 /* pointer into the output */
6454 char *str;
6455 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006456 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006457 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6458 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006459 PyObject *errorHandler = NULL;
6460 PyObject *exc = NULL;
6461 /* the following variable is used for caching string comparisons
6462 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6463 int known_errorHandler = -1;
6464
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006465 if (PyUnicode_READY(unicode) < 0)
6466 return NULL;
6467 size = PyUnicode_GET_LENGTH(unicode);
6468 kind = PyUnicode_KIND(unicode);
6469 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006470 /* allocate enough for a simple encoding without
6471 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006472 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006473 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006474 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006475 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006476 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006477 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006478 ressize = size;
6479
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006480 while (pos < size) {
6481 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006482
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 /* can we encode this? */
6484 if (c<limit) {
6485 /* no overflow check, because we know that the space is enough */
6486 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006487 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006488 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006490 Py_ssize_t requiredsize;
6491 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006492 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006493 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006494 Py_ssize_t collstart = pos;
6495 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006497 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 ++collend;
6499 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6500 if (known_errorHandler==-1) {
6501 if ((errors==NULL) || (!strcmp(errors, "strict")))
6502 known_errorHandler = 1;
6503 else if (!strcmp(errors, "replace"))
6504 known_errorHandler = 2;
6505 else if (!strcmp(errors, "ignore"))
6506 known_errorHandler = 3;
6507 else if (!strcmp(errors, "xmlcharrefreplace"))
6508 known_errorHandler = 4;
6509 else
6510 known_errorHandler = 0;
6511 }
6512 switch (known_errorHandler) {
6513 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006514 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 goto onError;
6516 case 2: /* replace */
6517 while (collstart++<collend)
6518 *str++ = '?'; /* fall through */
6519 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006520 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 break;
6522 case 4: /* xmlcharrefreplace */
6523 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006524 /* determine replacement size */
6525 for (i = collstart, repsize = 0; i < collend; ++i) {
6526 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6527 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006529 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006530 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006531 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006533 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006535 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006537 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006539 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006540 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006542 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006543 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006544 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006545 if (requiredsize > ressize) {
6546 if (requiredsize<2*ressize)
6547 requiredsize = 2*ressize;
6548 if (_PyBytes_Resize(&res, requiredsize))
6549 goto onError;
6550 str = PyBytes_AS_STRING(res) + respos;
6551 ressize = requiredsize;
6552 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006553 /* generate replacement */
6554 for (i = collstart; i < collend; ++i) {
6555 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006557 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 break;
6559 default:
6560 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006561 encoding, reason, unicode, &exc,
6562 collstart, collend, &newpos);
6563 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6564 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006565 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006566 if (PyBytes_Check(repunicode)) {
6567 /* Directly copy bytes result to output. */
6568 repsize = PyBytes_Size(repunicode);
6569 if (repsize > 1) {
6570 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006571 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006572 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6573 Py_DECREF(repunicode);
6574 goto onError;
6575 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006576 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006577 ressize += repsize-1;
6578 }
6579 memcpy(str, PyBytes_AsString(repunicode), repsize);
6580 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006581 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006582 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006583 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006584 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006585 /* need more space? (at least enough for what we
6586 have+the replacement+the rest of the string, so
6587 we won't have to check space for encodable characters) */
6588 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006589 repsize = PyUnicode_GET_LENGTH(repunicode);
6590 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 if (requiredsize > ressize) {
6592 if (requiredsize<2*ressize)
6593 requiredsize = 2*ressize;
6594 if (_PyBytes_Resize(&res, requiredsize)) {
6595 Py_DECREF(repunicode);
6596 goto onError;
6597 }
6598 str = PyBytes_AS_STRING(res) + respos;
6599 ressize = requiredsize;
6600 }
6601 /* check if there is anything unencodable in the replacement
6602 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006603 for (i = 0; repsize-->0; ++i, ++str) {
6604 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006605 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006606 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006607 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006608 Py_DECREF(repunicode);
6609 goto onError;
6610 }
6611 *str = (char)c;
6612 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006613 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006614 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006615 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006616 }
6617 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006618 /* Resize if we allocated to much */
6619 size = str - PyBytes_AS_STRING(res);
6620 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006621 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006622 if (_PyBytes_Resize(&res, size) < 0)
6623 goto onError;
6624 }
6625
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006626 Py_XDECREF(errorHandler);
6627 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006628 return res;
6629
6630 onError:
6631 Py_XDECREF(res);
6632 Py_XDECREF(errorHandler);
6633 Py_XDECREF(exc);
6634 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006635}
6636
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006637/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006638PyObject *
6639PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006640 Py_ssize_t size,
6641 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006643 PyObject *result;
6644 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6645 if (unicode == NULL)
6646 return NULL;
6647 result = unicode_encode_ucs1(unicode, errors, 256);
6648 Py_DECREF(unicode);
6649 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650}
6651
Alexander Belopolsky40018472011-02-26 01:02:56 +00006652PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006653_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654{
6655 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006656 PyErr_BadArgument();
6657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006659 if (PyUnicode_READY(unicode) == -1)
6660 return NULL;
6661 /* Fast path: if it is a one-byte string, construct
6662 bytes object directly. */
6663 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6664 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6665 PyUnicode_GET_LENGTH(unicode));
6666 /* Non-Latin-1 characters present. Defer to above function to
6667 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006668 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006669}
6670
6671PyObject*
6672PyUnicode_AsLatin1String(PyObject *unicode)
6673{
6674 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675}
6676
6677/* --- 7-bit ASCII Codec -------------------------------------------------- */
6678
Alexander Belopolsky40018472011-02-26 01:02:56 +00006679PyObject *
6680PyUnicode_DecodeASCII(const char *s,
6681 Py_ssize_t size,
6682 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006684 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006685 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006686 int kind;
6687 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006688 Py_ssize_t startinpos;
6689 Py_ssize_t endinpos;
6690 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006691 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006692 int has_error;
6693 const unsigned char *p = (const unsigned char *)s;
6694 const unsigned char *end = p + size;
6695 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006696 PyObject *errorHandler = NULL;
6697 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006698
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006699 if (size == 0) {
6700 Py_INCREF(unicode_empty);
6701 return unicode_empty;
6702 }
6703
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006705 if (size == 1 && (unsigned char)s[0] < 128)
6706 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006707
Victor Stinner702c7342011-10-05 13:50:52 +02006708 has_error = 0;
6709 while (p < end && !has_error) {
6710 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6711 an explanation. */
6712 if (!((size_t) p & LONG_PTR_MASK)) {
6713 /* Help register allocation */
6714 register const unsigned char *_p = p;
6715 while (_p < aligned_end) {
6716 unsigned long value = *(unsigned long *) _p;
6717 if (value & ASCII_CHAR_MASK) {
6718 has_error = 1;
6719 break;
6720 }
6721 _p += SIZEOF_LONG;
6722 }
6723 if (_p == end)
6724 break;
6725 if (has_error)
6726 break;
6727 p = _p;
6728 }
6729 if (*p & 0x80) {
6730 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006731 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006732 }
6733 else {
6734 ++p;
6735 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006736 }
Victor Stinner702c7342011-10-05 13:50:52 +02006737 if (!has_error)
6738 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006739
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006740 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006744 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006745 kind = PyUnicode_KIND(v);
6746 data = PyUnicode_DATA(v);
6747 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006748 e = s + size;
6749 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006750 register unsigned char c = (unsigned char)*s;
6751 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006752 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006753 ++s;
6754 }
6755 else {
6756 startinpos = s-starts;
6757 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006758 if (unicode_decode_call_errorhandler(
6759 errors, &errorHandler,
6760 "ascii", "ordinal not in range(128)",
6761 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006762 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006764 kind = PyUnicode_KIND(v);
6765 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006768 if (PyUnicode_Resize(&v, outpos) < 0)
6769 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006770 Py_XDECREF(errorHandler);
6771 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006772 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006773 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006774
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006777 Py_XDECREF(errorHandler);
6778 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779 return NULL;
6780}
6781
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006782/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006783PyObject *
6784PyUnicode_EncodeASCII(const Py_UNICODE *p,
6785 Py_ssize_t size,
6786 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006788 PyObject *result;
6789 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6790 if (unicode == NULL)
6791 return NULL;
6792 result = unicode_encode_ucs1(unicode, errors, 128);
6793 Py_DECREF(unicode);
6794 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795}
6796
Alexander Belopolsky40018472011-02-26 01:02:56 +00006797PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006798_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799{
6800 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006801 PyErr_BadArgument();
6802 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006804 if (PyUnicode_READY(unicode) == -1)
6805 return NULL;
6806 /* Fast path: if it is an ASCII-only string, construct bytes object
6807 directly. Else defer to above function to raise the exception. */
6808 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6809 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6810 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006811 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006812}
6813
6814PyObject *
6815PyUnicode_AsASCIIString(PyObject *unicode)
6816{
6817 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006818}
6819
Victor Stinner99b95382011-07-04 14:23:54 +02006820#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006821
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006822/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006823
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006824#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006825#define NEED_RETRY
6826#endif
6827
Victor Stinner3a50e702011-10-18 21:21:00 +02006828#ifndef WC_ERR_INVALID_CHARS
6829# define WC_ERR_INVALID_CHARS 0x0080
6830#endif
6831
6832static char*
6833code_page_name(UINT code_page, PyObject **obj)
6834{
6835 *obj = NULL;
6836 if (code_page == CP_ACP)
6837 return "mbcs";
6838 if (code_page == CP_UTF7)
6839 return "CP_UTF7";
6840 if (code_page == CP_UTF8)
6841 return "CP_UTF8";
6842
6843 *obj = PyBytes_FromFormat("cp%u", code_page);
6844 if (*obj == NULL)
6845 return NULL;
6846 return PyBytes_AS_STRING(*obj);
6847}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006848
Alexander Belopolsky40018472011-02-26 01:02:56 +00006849static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006850is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006851{
6852 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006853 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006854
Victor Stinner3a50e702011-10-18 21:21:00 +02006855 if (!IsDBCSLeadByteEx(code_page, *curr))
6856 return 0;
6857
6858 prev = CharPrevExA(code_page, s, curr, 0);
6859 if (prev == curr)
6860 return 1;
6861 /* FIXME: This code is limited to "true" double-byte encodings,
6862 as it assumes an incomplete character consists of a single
6863 byte. */
6864 if (curr - prev == 2)
6865 return 1;
6866 if (!IsDBCSLeadByteEx(code_page, *prev))
6867 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006868 return 0;
6869}
6870
Victor Stinner3a50e702011-10-18 21:21:00 +02006871static DWORD
6872decode_code_page_flags(UINT code_page)
6873{
6874 if (code_page == CP_UTF7) {
6875 /* The CP_UTF7 decoder only supports flags=0 */
6876 return 0;
6877 }
6878 else
6879 return MB_ERR_INVALID_CHARS;
6880}
6881
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006882/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006883 * Decode a byte string from a Windows code page into unicode object in strict
6884 * mode.
6885 *
6886 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6887 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006888 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006889static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006890decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006891 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006892 const char *in,
6893 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006894{
Victor Stinner3a50e702011-10-18 21:21:00 +02006895 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006896 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006897 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006898
6899 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006900 assert(insize > 0);
6901 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6902 if (outsize <= 0)
6903 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006904
6905 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006906 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006907 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006908 if (*v == NULL)
6909 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006910 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006911 }
6912 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006913 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006914 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006915 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006916 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006917 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006918 }
6919
6920 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006921 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6922 if (outsize <= 0)
6923 goto error;
6924 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006925
Victor Stinner3a50e702011-10-18 21:21:00 +02006926error:
6927 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6928 return -2;
6929 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006930 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006931}
6932
Victor Stinner3a50e702011-10-18 21:21:00 +02006933/*
6934 * Decode a byte string from a code page into unicode object with an error
6935 * handler.
6936 *
6937 * Returns consumed size if succeed, or raise a WindowsError or
6938 * UnicodeDecodeError exception and returns -1 on error.
6939 */
6940static int
6941decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006942 PyObject **v,
6943 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006944 const char *errors)
6945{
6946 const char *startin = in;
6947 const char *endin = in + size;
6948 const DWORD flags = decode_code_page_flags(code_page);
6949 /* Ideally, we should get reason from FormatMessage. This is the Windows
6950 2000 English version of the message. */
6951 const char *reason = "No mapping for the Unicode character exists "
6952 "in the target code page.";
6953 /* each step cannot decode more than 1 character, but a character can be
6954 represented as a surrogate pair */
6955 wchar_t buffer[2], *startout, *out;
6956 int insize, outsize;
6957 PyObject *errorHandler = NULL;
6958 PyObject *exc = NULL;
6959 PyObject *encoding_obj = NULL;
6960 char *encoding;
6961 DWORD err;
6962 int ret = -1;
6963
6964 assert(size > 0);
6965
6966 encoding = code_page_name(code_page, &encoding_obj);
6967 if (encoding == NULL)
6968 return -1;
6969
6970 if (errors == NULL || strcmp(errors, "strict") == 0) {
6971 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6972 UnicodeDecodeError. */
6973 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6974 if (exc != NULL) {
6975 PyCodec_StrictErrors(exc);
6976 Py_CLEAR(exc);
6977 }
6978 goto error;
6979 }
6980
6981 if (*v == NULL) {
6982 /* Create unicode object */
6983 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6984 PyErr_NoMemory();
6985 goto error;
6986 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006987 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006988 if (*v == NULL)
6989 goto error;
6990 startout = PyUnicode_AS_UNICODE(*v);
6991 }
6992 else {
6993 /* Extend unicode object */
6994 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6995 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6996 PyErr_NoMemory();
6997 goto error;
6998 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006999 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007000 goto error;
7001 startout = PyUnicode_AS_UNICODE(*v) + n;
7002 }
7003
7004 /* Decode the byte string character per character */
7005 out = startout;
7006 while (in < endin)
7007 {
7008 /* Decode a character */
7009 insize = 1;
7010 do
7011 {
7012 outsize = MultiByteToWideChar(code_page, flags,
7013 in, insize,
7014 buffer, Py_ARRAY_LENGTH(buffer));
7015 if (outsize > 0)
7016 break;
7017 err = GetLastError();
7018 if (err != ERROR_NO_UNICODE_TRANSLATION
7019 && err != ERROR_INSUFFICIENT_BUFFER)
7020 {
7021 PyErr_SetFromWindowsErr(0);
7022 goto error;
7023 }
7024 insize++;
7025 }
7026 /* 4=maximum length of a UTF-8 sequence */
7027 while (insize <= 4 && (in + insize) <= endin);
7028
7029 if (outsize <= 0) {
7030 Py_ssize_t startinpos, endinpos, outpos;
7031
7032 startinpos = in - startin;
7033 endinpos = startinpos + 1;
7034 outpos = out - PyUnicode_AS_UNICODE(*v);
7035 if (unicode_decode_call_errorhandler(
7036 errors, &errorHandler,
7037 encoding, reason,
7038 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007039 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007040 {
7041 goto error;
7042 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007043 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007044 }
7045 else {
7046 in += insize;
7047 memcpy(out, buffer, outsize * sizeof(wchar_t));
7048 out += outsize;
7049 }
7050 }
7051
7052 /* write a NUL character at the end */
7053 *out = 0;
7054
7055 /* Extend unicode object */
7056 outsize = out - startout;
7057 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007058 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007059 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007060 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007061
7062error:
7063 Py_XDECREF(encoding_obj);
7064 Py_XDECREF(errorHandler);
7065 Py_XDECREF(exc);
7066 return ret;
7067}
7068
Victor Stinner3a50e702011-10-18 21:21:00 +02007069static PyObject *
7070decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007071 const char *s, Py_ssize_t size,
7072 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007073{
Victor Stinner76a31a62011-11-04 00:05:13 +01007074 PyObject *v = NULL;
7075 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007076
Victor Stinner3a50e702011-10-18 21:21:00 +02007077 if (code_page < 0) {
7078 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7079 return NULL;
7080 }
7081
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007082 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007083 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007084
Victor Stinner76a31a62011-11-04 00:05:13 +01007085 do
7086 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007087#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007088 if (size > INT_MAX) {
7089 chunk_size = INT_MAX;
7090 final = 0;
7091 done = 0;
7092 }
7093 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007094#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007095 {
7096 chunk_size = (int)size;
7097 final = (consumed == NULL);
7098 done = 1;
7099 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007100
Victor Stinner76a31a62011-11-04 00:05:13 +01007101 /* Skip trailing lead-byte unless 'final' is set */
7102 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7103 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007104
Victor Stinner76a31a62011-11-04 00:05:13 +01007105 if (chunk_size == 0 && done) {
7106 if (v != NULL)
7107 break;
7108 Py_INCREF(unicode_empty);
7109 return unicode_empty;
7110 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007111
Victor Stinner76a31a62011-11-04 00:05:13 +01007112
7113 converted = decode_code_page_strict(code_page, &v,
7114 s, chunk_size);
7115 if (converted == -2)
7116 converted = decode_code_page_errors(code_page, &v,
7117 s, chunk_size,
7118 errors);
7119 assert(converted != 0);
7120
7121 if (converted < 0) {
7122 Py_XDECREF(v);
7123 return NULL;
7124 }
7125
7126 if (consumed)
7127 *consumed += converted;
7128
7129 s += converted;
7130 size -= converted;
7131 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007132
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007133 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007134}
7135
Alexander Belopolsky40018472011-02-26 01:02:56 +00007136PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007137PyUnicode_DecodeCodePageStateful(int code_page,
7138 const char *s,
7139 Py_ssize_t size,
7140 const char *errors,
7141 Py_ssize_t *consumed)
7142{
7143 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7144}
7145
7146PyObject *
7147PyUnicode_DecodeMBCSStateful(const char *s,
7148 Py_ssize_t size,
7149 const char *errors,
7150 Py_ssize_t *consumed)
7151{
7152 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7153}
7154
7155PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007156PyUnicode_DecodeMBCS(const char *s,
7157 Py_ssize_t size,
7158 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007159{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007160 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7161}
7162
Victor Stinner3a50e702011-10-18 21:21:00 +02007163static DWORD
7164encode_code_page_flags(UINT code_page, const char *errors)
7165{
7166 if (code_page == CP_UTF8) {
7167 if (winver.dwMajorVersion >= 6)
7168 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7169 and later */
7170 return WC_ERR_INVALID_CHARS;
7171 else
7172 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7173 return 0;
7174 }
7175 else if (code_page == CP_UTF7) {
7176 /* CP_UTF7 only supports flags=0 */
7177 return 0;
7178 }
7179 else {
7180 if (errors != NULL && strcmp(errors, "replace") == 0)
7181 return 0;
7182 else
7183 return WC_NO_BEST_FIT_CHARS;
7184 }
7185}
7186
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007187/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007188 * Encode a Unicode string to a Windows code page into a byte string in strict
7189 * mode.
7190 *
7191 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7192 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007193 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007194static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007195encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007196 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007197 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007198{
Victor Stinner554f3f02010-06-16 23:33:54 +00007199 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007200 BOOL *pusedDefaultChar = &usedDefaultChar;
7201 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007202 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007203 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007204 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007205 const DWORD flags = encode_code_page_flags(code_page, NULL);
7206 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007207 /* Create a substring so that we can get the UTF-16 representation
7208 of just the slice under consideration. */
7209 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007210
Martin v. Löwis3d325192011-11-04 18:23:06 +01007211 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007212
Victor Stinner3a50e702011-10-18 21:21:00 +02007213 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007214 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007215 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007216 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007217
Victor Stinner2fc507f2011-11-04 20:06:39 +01007218 substring = PyUnicode_Substring(unicode, offset, offset+len);
7219 if (substring == NULL)
7220 return -1;
7221 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7222 if (p == NULL) {
7223 Py_DECREF(substring);
7224 return -1;
7225 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007226
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007227 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007228 outsize = WideCharToMultiByte(code_page, flags,
7229 p, size,
7230 NULL, 0,
7231 NULL, pusedDefaultChar);
7232 if (outsize <= 0)
7233 goto error;
7234 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007235 if (pusedDefaultChar && *pusedDefaultChar) {
7236 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007237 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007238 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007239
Victor Stinner3a50e702011-10-18 21:21:00 +02007240 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007241 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007242 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007243 if (*outbytes == NULL) {
7244 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007245 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007246 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007247 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007248 }
7249 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007250 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007251 const Py_ssize_t n = PyBytes_Size(*outbytes);
7252 if (outsize > PY_SSIZE_T_MAX - n) {
7253 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007254 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007255 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007256 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007257 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7258 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007259 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007260 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007261 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007262 }
7263
7264 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007265 outsize = WideCharToMultiByte(code_page, flags,
7266 p, size,
7267 out, outsize,
7268 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007269 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007270 if (outsize <= 0)
7271 goto error;
7272 if (pusedDefaultChar && *pusedDefaultChar)
7273 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007274 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007275
Victor Stinner3a50e702011-10-18 21:21:00 +02007276error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007277 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007278 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7279 return -2;
7280 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007281 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007282}
7283
Victor Stinner3a50e702011-10-18 21:21:00 +02007284/*
7285 * Encode a Unicode string to a Windows code page into a byte string using a
7286 * error handler.
7287 *
7288 * Returns consumed characters if succeed, or raise a WindowsError and returns
7289 * -1 on other error.
7290 */
7291static int
7292encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007293 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007294 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007295{
Victor Stinner3a50e702011-10-18 21:21:00 +02007296 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007297 Py_ssize_t pos = unicode_offset;
7298 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007299 /* Ideally, we should get reason from FormatMessage. This is the Windows
7300 2000 English version of the message. */
7301 const char *reason = "invalid character";
7302 /* 4=maximum length of a UTF-8 sequence */
7303 char buffer[4];
7304 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7305 Py_ssize_t outsize;
7306 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007307 PyObject *errorHandler = NULL;
7308 PyObject *exc = NULL;
7309 PyObject *encoding_obj = NULL;
7310 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007311 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007312 PyObject *rep;
7313 int ret = -1;
7314
7315 assert(insize > 0);
7316
7317 encoding = code_page_name(code_page, &encoding_obj);
7318 if (encoding == NULL)
7319 return -1;
7320
7321 if (errors == NULL || strcmp(errors, "strict") == 0) {
7322 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7323 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007324 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007325 if (exc != NULL) {
7326 PyCodec_StrictErrors(exc);
7327 Py_DECREF(exc);
7328 }
7329 Py_XDECREF(encoding_obj);
7330 return -1;
7331 }
7332
7333 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7334 pusedDefaultChar = &usedDefaultChar;
7335 else
7336 pusedDefaultChar = NULL;
7337
7338 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7339 PyErr_NoMemory();
7340 goto error;
7341 }
7342 outsize = insize * Py_ARRAY_LENGTH(buffer);
7343
7344 if (*outbytes == NULL) {
7345 /* Create string object */
7346 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7347 if (*outbytes == NULL)
7348 goto error;
7349 out = PyBytes_AS_STRING(*outbytes);
7350 }
7351 else {
7352 /* Extend string object */
7353 Py_ssize_t n = PyBytes_Size(*outbytes);
7354 if (n > PY_SSIZE_T_MAX - outsize) {
7355 PyErr_NoMemory();
7356 goto error;
7357 }
7358 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7359 goto error;
7360 out = PyBytes_AS_STRING(*outbytes) + n;
7361 }
7362
7363 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007364 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007365 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007366 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7367 wchar_t chars[2];
7368 int charsize;
7369 if (ch < 0x10000) {
7370 chars[0] = (wchar_t)ch;
7371 charsize = 1;
7372 }
7373 else {
7374 ch -= 0x10000;
7375 chars[0] = 0xd800 + (ch >> 10);
7376 chars[1] = 0xdc00 + (ch & 0x3ff);
7377 charsize = 2;
7378 }
7379
Victor Stinner3a50e702011-10-18 21:21:00 +02007380 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007381 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007382 buffer, Py_ARRAY_LENGTH(buffer),
7383 NULL, pusedDefaultChar);
7384 if (outsize > 0) {
7385 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7386 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007387 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007388 memcpy(out, buffer, outsize);
7389 out += outsize;
7390 continue;
7391 }
7392 }
7393 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7394 PyErr_SetFromWindowsErr(0);
7395 goto error;
7396 }
7397
Victor Stinner3a50e702011-10-18 21:21:00 +02007398 rep = unicode_encode_call_errorhandler(
7399 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007400 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007401 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007402 if (rep == NULL)
7403 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007404 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007405
7406 if (PyBytes_Check(rep)) {
7407 outsize = PyBytes_GET_SIZE(rep);
7408 if (outsize != 1) {
7409 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7410 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7411 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7412 Py_DECREF(rep);
7413 goto error;
7414 }
7415 out = PyBytes_AS_STRING(*outbytes) + offset;
7416 }
7417 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7418 out += outsize;
7419 }
7420 else {
7421 Py_ssize_t i;
7422 enum PyUnicode_Kind kind;
7423 void *data;
7424
7425 if (PyUnicode_READY(rep) < 0) {
7426 Py_DECREF(rep);
7427 goto error;
7428 }
7429
7430 outsize = PyUnicode_GET_LENGTH(rep);
7431 if (outsize != 1) {
7432 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7433 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7434 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7435 Py_DECREF(rep);
7436 goto error;
7437 }
7438 out = PyBytes_AS_STRING(*outbytes) + offset;
7439 }
7440 kind = PyUnicode_KIND(rep);
7441 data = PyUnicode_DATA(rep);
7442 for (i=0; i < outsize; i++) {
7443 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7444 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007445 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007446 encoding, unicode,
7447 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007448 "unable to encode error handler result to ASCII");
7449 Py_DECREF(rep);
7450 goto error;
7451 }
7452 *out = (unsigned char)ch;
7453 out++;
7454 }
7455 }
7456 Py_DECREF(rep);
7457 }
7458 /* write a NUL byte */
7459 *out = 0;
7460 outsize = out - PyBytes_AS_STRING(*outbytes);
7461 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7462 if (_PyBytes_Resize(outbytes, outsize) < 0)
7463 goto error;
7464 ret = 0;
7465
7466error:
7467 Py_XDECREF(encoding_obj);
7468 Py_XDECREF(errorHandler);
7469 Py_XDECREF(exc);
7470 return ret;
7471}
7472
Victor Stinner3a50e702011-10-18 21:21:00 +02007473static PyObject *
7474encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007475 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007476 const char *errors)
7477{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007478 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007479 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007480 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007481 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007482
Victor Stinner2fc507f2011-11-04 20:06:39 +01007483 if (PyUnicode_READY(unicode) < 0)
7484 return NULL;
7485 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007486
Victor Stinner3a50e702011-10-18 21:21:00 +02007487 if (code_page < 0) {
7488 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7489 return NULL;
7490 }
7491
Martin v. Löwis3d325192011-11-04 18:23:06 +01007492 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007493 return PyBytes_FromStringAndSize(NULL, 0);
7494
Victor Stinner7581cef2011-11-03 22:32:33 +01007495 offset = 0;
7496 do
7497 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007498#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007499 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007500 chunks. */
7501 if (len > INT_MAX/2) {
7502 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007503 done = 0;
7504 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007505 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007506#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007507 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007508 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007509 done = 1;
7510 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007511
Victor Stinner76a31a62011-11-04 00:05:13 +01007512 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007513 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007514 errors);
7515 if (ret == -2)
7516 ret = encode_code_page_errors(code_page, &outbytes,
7517 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007518 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007519 if (ret < 0) {
7520 Py_XDECREF(outbytes);
7521 return NULL;
7522 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007523
Victor Stinner7581cef2011-11-03 22:32:33 +01007524 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007525 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007526 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007527
Victor Stinner3a50e702011-10-18 21:21:00 +02007528 return outbytes;
7529}
7530
7531PyObject *
7532PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7533 Py_ssize_t size,
7534 const char *errors)
7535{
Victor Stinner7581cef2011-11-03 22:32:33 +01007536 PyObject *unicode, *res;
7537 unicode = PyUnicode_FromUnicode(p, size);
7538 if (unicode == NULL)
7539 return NULL;
7540 res = encode_code_page(CP_ACP, unicode, errors);
7541 Py_DECREF(unicode);
7542 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007543}
7544
7545PyObject *
7546PyUnicode_EncodeCodePage(int code_page,
7547 PyObject *unicode,
7548 const char *errors)
7549{
Victor Stinner7581cef2011-11-03 22:32:33 +01007550 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007551}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007552
Alexander Belopolsky40018472011-02-26 01:02:56 +00007553PyObject *
7554PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007555{
7556 if (!PyUnicode_Check(unicode)) {
7557 PyErr_BadArgument();
7558 return NULL;
7559 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007560 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007561}
7562
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007563#undef NEED_RETRY
7564
Victor Stinner99b95382011-07-04 14:23:54 +02007565#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007566
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567/* --- Character Mapping Codec -------------------------------------------- */
7568
Alexander Belopolsky40018472011-02-26 01:02:56 +00007569PyObject *
7570PyUnicode_DecodeCharmap(const char *s,
7571 Py_ssize_t size,
7572 PyObject *mapping,
7573 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007575 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007576 Py_ssize_t startinpos;
7577 Py_ssize_t endinpos;
7578 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007579 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007580 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007581 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007582 PyObject *errorHandler = NULL;
7583 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007584
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585 /* Default to Latin-1 */
7586 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007587 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007589 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007591 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007593 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007594 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007595 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007596 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007597 Py_ssize_t maplen;
7598 enum PyUnicode_Kind kind;
7599 void *data;
7600 Py_UCS4 x;
7601
7602 if (PyUnicode_READY(mapping) < 0)
7603 return NULL;
7604
7605 maplen = PyUnicode_GET_LENGTH(mapping);
7606 data = PyUnicode_DATA(mapping);
7607 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007608 while (s < e) {
7609 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610
Benjamin Peterson29060642009-01-31 22:14:21 +00007611 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007612 x = PyUnicode_READ(kind, data, ch);
7613 else
7614 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007616 if (x == 0xfffe)
7617 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007618 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007619 startinpos = s-starts;
7620 endinpos = startinpos+1;
7621 if (unicode_decode_call_errorhandler(
7622 errors, &errorHandler,
7623 "charmap", "character maps to <undefined>",
7624 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007625 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007626 goto onError;
7627 }
7628 continue;
7629 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007630
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007631 if (unicode_putchar(&v, &outpos, x) < 0)
7632 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007633 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007634 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007635 }
7636 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 while (s < e) {
7638 unsigned char ch = *s;
7639 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007640
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7642 w = PyLong_FromLong((long)ch);
7643 if (w == NULL)
7644 goto onError;
7645 x = PyObject_GetItem(mapping, w);
7646 Py_DECREF(w);
7647 if (x == NULL) {
7648 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7649 /* No mapping found means: mapping is undefined. */
7650 PyErr_Clear();
7651 x = Py_None;
7652 Py_INCREF(x);
7653 } else
7654 goto onError;
7655 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007656
Benjamin Peterson29060642009-01-31 22:14:21 +00007657 /* Apply mapping */
7658 if (PyLong_Check(x)) {
7659 long value = PyLong_AS_LONG(x);
7660 if (value < 0 || value > 65535) {
7661 PyErr_SetString(PyExc_TypeError,
7662 "character mapping must be in range(65536)");
7663 Py_DECREF(x);
7664 goto onError;
7665 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007666 if (unicode_putchar(&v, &outpos, value) < 0)
7667 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 }
7669 else if (x == Py_None) {
7670 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007671 startinpos = s-starts;
7672 endinpos = startinpos+1;
7673 if (unicode_decode_call_errorhandler(
7674 errors, &errorHandler,
7675 "charmap", "character maps to <undefined>",
7676 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007677 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 Py_DECREF(x);
7679 goto onError;
7680 }
7681 Py_DECREF(x);
7682 continue;
7683 }
7684 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007685 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007686
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007687 if (PyUnicode_READY(x) < 0)
7688 goto onError;
7689 targetsize = PyUnicode_GET_LENGTH(x);
7690
7691 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007692 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007693 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007694 PyUnicode_READ_CHAR(x, 0)) < 0)
7695 goto onError;
7696 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007697 else if (targetsize > 1) {
7698 /* 1-n mapping */
7699 if (targetsize > extrachars) {
7700 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007701 Py_ssize_t needed = (targetsize - extrachars) + \
7702 (targetsize << 2);
7703 extrachars += needed;
7704 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007705 if (PyUnicode_Resize(&v,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007706 PyUnicode_GET_LENGTH(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007707 Py_DECREF(x);
7708 goto onError;
7709 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007710 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007711 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7712 goto onError;
7713 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7714 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007715 extrachars -= targetsize;
7716 }
7717 /* 1-0 mapping: skip the character */
7718 }
7719 else {
7720 /* wrong return value */
7721 PyErr_SetString(PyExc_TypeError,
7722 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007723 Py_DECREF(x);
7724 goto onError;
7725 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 Py_DECREF(x);
7727 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007728 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007730 if (PyUnicode_Resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007731 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007732 Py_XDECREF(errorHandler);
7733 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007734 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007735
Benjamin Peterson29060642009-01-31 22:14:21 +00007736 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007737 Py_XDECREF(errorHandler);
7738 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739 Py_XDECREF(v);
7740 return NULL;
7741}
7742
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007743/* Charmap encoding: the lookup table */
7744
Alexander Belopolsky40018472011-02-26 01:02:56 +00007745struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007746 PyObject_HEAD
7747 unsigned char level1[32];
7748 int count2, count3;
7749 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007750};
7751
7752static PyObject*
7753encoding_map_size(PyObject *obj, PyObject* args)
7754{
7755 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007756 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007757 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007758}
7759
7760static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007761 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007762 PyDoc_STR("Return the size (in bytes) of this object") },
7763 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007764};
7765
7766static void
7767encoding_map_dealloc(PyObject* o)
7768{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007769 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007770}
7771
7772static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007773 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 "EncodingMap", /*tp_name*/
7775 sizeof(struct encoding_map), /*tp_basicsize*/
7776 0, /*tp_itemsize*/
7777 /* methods */
7778 encoding_map_dealloc, /*tp_dealloc*/
7779 0, /*tp_print*/
7780 0, /*tp_getattr*/
7781 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007782 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 0, /*tp_repr*/
7784 0, /*tp_as_number*/
7785 0, /*tp_as_sequence*/
7786 0, /*tp_as_mapping*/
7787 0, /*tp_hash*/
7788 0, /*tp_call*/
7789 0, /*tp_str*/
7790 0, /*tp_getattro*/
7791 0, /*tp_setattro*/
7792 0, /*tp_as_buffer*/
7793 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7794 0, /*tp_doc*/
7795 0, /*tp_traverse*/
7796 0, /*tp_clear*/
7797 0, /*tp_richcompare*/
7798 0, /*tp_weaklistoffset*/
7799 0, /*tp_iter*/
7800 0, /*tp_iternext*/
7801 encoding_map_methods, /*tp_methods*/
7802 0, /*tp_members*/
7803 0, /*tp_getset*/
7804 0, /*tp_base*/
7805 0, /*tp_dict*/
7806 0, /*tp_descr_get*/
7807 0, /*tp_descr_set*/
7808 0, /*tp_dictoffset*/
7809 0, /*tp_init*/
7810 0, /*tp_alloc*/
7811 0, /*tp_new*/
7812 0, /*tp_free*/
7813 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007814};
7815
7816PyObject*
7817PyUnicode_BuildEncodingMap(PyObject* string)
7818{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007819 PyObject *result;
7820 struct encoding_map *mresult;
7821 int i;
7822 int need_dict = 0;
7823 unsigned char level1[32];
7824 unsigned char level2[512];
7825 unsigned char *mlevel1, *mlevel2, *mlevel3;
7826 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007827 int kind;
7828 void *data;
7829 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007831 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007832 PyErr_BadArgument();
7833 return NULL;
7834 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007835 kind = PyUnicode_KIND(string);
7836 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007837 memset(level1, 0xFF, sizeof level1);
7838 memset(level2, 0xFF, sizeof level2);
7839
7840 /* If there isn't a one-to-one mapping of NULL to \0,
7841 or if there are non-BMP characters, we need to use
7842 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007843 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007844 need_dict = 1;
7845 for (i = 1; i < 256; i++) {
7846 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007847 ch = PyUnicode_READ(kind, data, i);
7848 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007849 need_dict = 1;
7850 break;
7851 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007852 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007853 /* unmapped character */
7854 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007855 l1 = ch >> 11;
7856 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007857 if (level1[l1] == 0xFF)
7858 level1[l1] = count2++;
7859 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007860 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007861 }
7862
7863 if (count2 >= 0xFF || count3 >= 0xFF)
7864 need_dict = 1;
7865
7866 if (need_dict) {
7867 PyObject *result = PyDict_New();
7868 PyObject *key, *value;
7869 if (!result)
7870 return NULL;
7871 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007872 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007873 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007874 if (!key || !value)
7875 goto failed1;
7876 if (PyDict_SetItem(result, key, value) == -1)
7877 goto failed1;
7878 Py_DECREF(key);
7879 Py_DECREF(value);
7880 }
7881 return result;
7882 failed1:
7883 Py_XDECREF(key);
7884 Py_XDECREF(value);
7885 Py_DECREF(result);
7886 return NULL;
7887 }
7888
7889 /* Create a three-level trie */
7890 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7891 16*count2 + 128*count3 - 1);
7892 if (!result)
7893 return PyErr_NoMemory();
7894 PyObject_Init(result, &EncodingMapType);
7895 mresult = (struct encoding_map*)result;
7896 mresult->count2 = count2;
7897 mresult->count3 = count3;
7898 mlevel1 = mresult->level1;
7899 mlevel2 = mresult->level23;
7900 mlevel3 = mresult->level23 + 16*count2;
7901 memcpy(mlevel1, level1, 32);
7902 memset(mlevel2, 0xFF, 16*count2);
7903 memset(mlevel3, 0, 128*count3);
7904 count3 = 0;
7905 for (i = 1; i < 256; i++) {
7906 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007907 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007908 /* unmapped character */
7909 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007910 o1 = PyUnicode_READ(kind, data, i)>>11;
7911 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007912 i2 = 16*mlevel1[o1] + o2;
7913 if (mlevel2[i2] == 0xFF)
7914 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007915 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007916 i3 = 128*mlevel2[i2] + o3;
7917 mlevel3[i3] = i;
7918 }
7919 return result;
7920}
7921
7922static int
Victor Stinner22168992011-11-20 17:09:18 +01007923encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007924{
7925 struct encoding_map *map = (struct encoding_map*)mapping;
7926 int l1 = c>>11;
7927 int l2 = (c>>7) & 0xF;
7928 int l3 = c & 0x7F;
7929 int i;
7930
Victor Stinner22168992011-11-20 17:09:18 +01007931 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007932 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007933 if (c == 0)
7934 return 0;
7935 /* level 1*/
7936 i = map->level1[l1];
7937 if (i == 0xFF) {
7938 return -1;
7939 }
7940 /* level 2*/
7941 i = map->level23[16*i+l2];
7942 if (i == 0xFF) {
7943 return -1;
7944 }
7945 /* level 3 */
7946 i = map->level23[16*map->count2 + 128*i + l3];
7947 if (i == 0) {
7948 return -1;
7949 }
7950 return i;
7951}
7952
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007953/* Lookup the character ch in the mapping. If the character
7954 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007955 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007956static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007957charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958{
Christian Heimes217cfd12007-12-02 14:31:20 +00007959 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007960 PyObject *x;
7961
7962 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007963 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007964 x = PyObject_GetItem(mapping, w);
7965 Py_DECREF(w);
7966 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007967 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7968 /* No mapping found means: mapping is undefined. */
7969 PyErr_Clear();
7970 x = Py_None;
7971 Py_INCREF(x);
7972 return x;
7973 } else
7974 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007976 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007977 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007978 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 long value = PyLong_AS_LONG(x);
7980 if (value < 0 || value > 255) {
7981 PyErr_SetString(PyExc_TypeError,
7982 "character mapping must be in range(256)");
7983 Py_DECREF(x);
7984 return NULL;
7985 }
7986 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007988 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 /* wrong return value */
7992 PyErr_Format(PyExc_TypeError,
7993 "character mapping must return integer, bytes or None, not %.400s",
7994 x->ob_type->tp_name);
7995 Py_DECREF(x);
7996 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997 }
7998}
7999
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008000static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008001charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008002{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008003 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8004 /* exponentially overallocate to minimize reallocations */
8005 if (requiredsize < 2*outsize)
8006 requiredsize = 2*outsize;
8007 if (_PyBytes_Resize(outobj, requiredsize))
8008 return -1;
8009 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008010}
8011
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008015/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008016 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008017 space is available. Return a new reference to the object that
8018 was put in the output buffer, or Py_None, if the mapping was undefined
8019 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008020 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008021static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008022charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008023 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008024{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008025 PyObject *rep;
8026 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008027 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008028
Christian Heimes90aa7642007-12-19 02:45:37 +00008029 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008030 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008031 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008032 if (res == -1)
8033 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 if (outsize<requiredsize)
8035 if (charmapencode_resize(outobj, outpos, requiredsize))
8036 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008037 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008038 outstart[(*outpos)++] = (char)res;
8039 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008040 }
8041
8042 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008043 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008045 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008046 Py_DECREF(rep);
8047 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008048 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008049 if (PyLong_Check(rep)) {
8050 Py_ssize_t requiredsize = *outpos+1;
8051 if (outsize<requiredsize)
8052 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8053 Py_DECREF(rep);
8054 return enc_EXCEPTION;
8055 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008056 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008058 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008059 else {
8060 const char *repchars = PyBytes_AS_STRING(rep);
8061 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8062 Py_ssize_t requiredsize = *outpos+repsize;
8063 if (outsize<requiredsize)
8064 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8065 Py_DECREF(rep);
8066 return enc_EXCEPTION;
8067 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008068 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 memcpy(outstart + *outpos, repchars, repsize);
8070 *outpos += repsize;
8071 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008072 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008073 Py_DECREF(rep);
8074 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008075}
8076
8077/* handle an error in PyUnicode_EncodeCharmap
8078 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008079static int
8080charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008081 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008082 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008083 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008084 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008085{
8086 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008087 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008088 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008089 enum PyUnicode_Kind kind;
8090 void *data;
8091 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008092 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008093 Py_ssize_t collstartpos = *inpos;
8094 Py_ssize_t collendpos = *inpos+1;
8095 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008096 char *encoding = "charmap";
8097 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008098 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008099 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008100 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008101
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008102 if (PyUnicode_READY(unicode) < 0)
8103 return -1;
8104 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008105 /* find all unencodable characters */
8106 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008107 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008108 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008109 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008110 val = encoding_map_lookup(ch, mapping);
8111 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 break;
8113 ++collendpos;
8114 continue;
8115 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008116
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008117 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8118 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 if (rep==NULL)
8120 return -1;
8121 else if (rep!=Py_None) {
8122 Py_DECREF(rep);
8123 break;
8124 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008125 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008126 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008127 }
8128 /* cache callback name lookup
8129 * (if not done yet, i.e. it's the first error) */
8130 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 if ((errors==NULL) || (!strcmp(errors, "strict")))
8132 *known_errorHandler = 1;
8133 else if (!strcmp(errors, "replace"))
8134 *known_errorHandler = 2;
8135 else if (!strcmp(errors, "ignore"))
8136 *known_errorHandler = 3;
8137 else if (!strcmp(errors, "xmlcharrefreplace"))
8138 *known_errorHandler = 4;
8139 else
8140 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008141 }
8142 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008143 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008144 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008145 return -1;
8146 case 2: /* replace */
8147 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008148 x = charmapencode_output('?', mapping, res, respos);
8149 if (x==enc_EXCEPTION) {
8150 return -1;
8151 }
8152 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008153 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008154 return -1;
8155 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008156 }
8157 /* fall through */
8158 case 3: /* ignore */
8159 *inpos = collendpos;
8160 break;
8161 case 4: /* xmlcharrefreplace */
8162 /* generate replacement (temporarily (mis)uses p) */
8163 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008164 char buffer[2+29+1+1];
8165 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008166 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 for (cp = buffer; *cp; ++cp) {
8168 x = charmapencode_output(*cp, mapping, res, respos);
8169 if (x==enc_EXCEPTION)
8170 return -1;
8171 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008172 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 return -1;
8174 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008175 }
8176 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008177 *inpos = collendpos;
8178 break;
8179 default:
8180 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008181 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008182 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008183 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008185 if (PyBytes_Check(repunicode)) {
8186 /* Directly copy bytes result to output. */
8187 Py_ssize_t outsize = PyBytes_Size(*res);
8188 Py_ssize_t requiredsize;
8189 repsize = PyBytes_Size(repunicode);
8190 requiredsize = *respos + repsize;
8191 if (requiredsize > outsize)
8192 /* Make room for all additional bytes. */
8193 if (charmapencode_resize(res, respos, requiredsize)) {
8194 Py_DECREF(repunicode);
8195 return -1;
8196 }
8197 memcpy(PyBytes_AsString(*res) + *respos,
8198 PyBytes_AsString(repunicode), repsize);
8199 *respos += repsize;
8200 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008201 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008202 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008203 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008204 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008205 if (PyUnicode_READY(repunicode) < 0) {
8206 Py_DECREF(repunicode);
8207 return -1;
8208 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008209 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008210 data = PyUnicode_DATA(repunicode);
8211 kind = PyUnicode_KIND(repunicode);
8212 for (index = 0; index < repsize; index++) {
8213 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8214 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008215 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008216 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 return -1;
8218 }
8219 else if (x==enc_FAILED) {
8220 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008221 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008222 return -1;
8223 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008224 }
8225 *inpos = newpos;
8226 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008227 }
8228 return 0;
8229}
8230
Alexander Belopolsky40018472011-02-26 01:02:56 +00008231PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008232_PyUnicode_EncodeCharmap(PyObject *unicode,
8233 PyObject *mapping,
8234 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008235{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008236 /* output object */
8237 PyObject *res = NULL;
8238 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008239 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008240 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008241 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008242 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008243 PyObject *errorHandler = NULL;
8244 PyObject *exc = NULL;
8245 /* the following variable is used for caching string comparisons
8246 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8247 * 3=ignore, 4=xmlcharrefreplace */
8248 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008250 if (PyUnicode_READY(unicode) < 0)
8251 return NULL;
8252 size = PyUnicode_GET_LENGTH(unicode);
8253
Guido van Rossumd57fd912000-03-10 22:53:23 +00008254 /* Default to Latin-1 */
8255 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008256 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258 /* allocate enough for a simple encoding without
8259 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008260 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008261 if (res == NULL)
8262 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008263 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008266 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008267 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008269 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 if (x==enc_EXCEPTION) /* error */
8271 goto onError;
8272 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008273 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 &exc,
8275 &known_errorHandler, &errorHandler, errors,
8276 &res, &respos)) {
8277 goto onError;
8278 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008279 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 else
8281 /* done with this character => adjust input position */
8282 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008285 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008286 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008287 if (_PyBytes_Resize(&res, respos) < 0)
8288 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008289
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008290 Py_XDECREF(exc);
8291 Py_XDECREF(errorHandler);
8292 return res;
8293
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008295 Py_XDECREF(res);
8296 Py_XDECREF(exc);
8297 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298 return NULL;
8299}
8300
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008301/* Deprecated */
8302PyObject *
8303PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8304 Py_ssize_t size,
8305 PyObject *mapping,
8306 const char *errors)
8307{
8308 PyObject *result;
8309 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8310 if (unicode == NULL)
8311 return NULL;
8312 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8313 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008314 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008315}
8316
Alexander Belopolsky40018472011-02-26 01:02:56 +00008317PyObject *
8318PyUnicode_AsCharmapString(PyObject *unicode,
8319 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008320{
8321 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 PyErr_BadArgument();
8323 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008325 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326}
8327
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008328/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008329static void
8330make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008331 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008332 Py_ssize_t startpos, Py_ssize_t endpos,
8333 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008335 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008336 *exceptionObject = _PyUnicodeTranslateError_Create(
8337 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338 }
8339 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8341 goto onError;
8342 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8343 goto onError;
8344 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8345 goto onError;
8346 return;
8347 onError:
8348 Py_DECREF(*exceptionObject);
8349 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008350 }
8351}
8352
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008353/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008354static void
8355raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008356 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008357 Py_ssize_t startpos, Py_ssize_t endpos,
8358 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008359{
8360 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008361 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008362 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008364}
8365
8366/* error handling callback helper:
8367 build arguments, call the callback and check the arguments,
8368 put the result into newpos and return the replacement string, which
8369 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008370static PyObject *
8371unicode_translate_call_errorhandler(const char *errors,
8372 PyObject **errorHandler,
8373 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008374 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008375 Py_ssize_t startpos, Py_ssize_t endpos,
8376 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008378 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008379
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008380 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381 PyObject *restuple;
8382 PyObject *resunicode;
8383
8384 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008386 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008388 }
8389
8390 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394
8395 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008397 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008399 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008400 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 Py_DECREF(restuple);
8402 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403 }
8404 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 &resunicode, &i_newpos)) {
8406 Py_DECREF(restuple);
8407 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008409 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008410 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008411 else
8412 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008413 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8415 Py_DECREF(restuple);
8416 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008417 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418 Py_INCREF(resunicode);
8419 Py_DECREF(restuple);
8420 return resunicode;
8421}
8422
8423/* Lookup the character ch in the mapping and put the result in result,
8424 which must be decrefed by the caller.
8425 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008426static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008427charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008428{
Christian Heimes217cfd12007-12-02 14:31:20 +00008429 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008430 PyObject *x;
8431
8432 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008433 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434 x = PyObject_GetItem(mapping, w);
8435 Py_DECREF(w);
8436 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8438 /* No mapping found means: use 1:1 mapping. */
8439 PyErr_Clear();
8440 *result = NULL;
8441 return 0;
8442 } else
8443 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008444 }
8445 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 *result = x;
8447 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008448 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008449 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 long value = PyLong_AS_LONG(x);
8451 long max = PyUnicode_GetMax();
8452 if (value < 0 || value > max) {
8453 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008454 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 Py_DECREF(x);
8456 return -1;
8457 }
8458 *result = x;
8459 return 0;
8460 }
8461 else if (PyUnicode_Check(x)) {
8462 *result = x;
8463 return 0;
8464 }
8465 else {
8466 /* wrong return value */
8467 PyErr_SetString(PyExc_TypeError,
8468 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008469 Py_DECREF(x);
8470 return -1;
8471 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008472}
8473/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 if not reallocate and adjust various state variables.
8475 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008476static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008477charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008479{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008480 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008481 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 /* exponentially overallocate to minimize reallocations */
8483 if (requiredsize < 2 * oldsize)
8484 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008485 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8486 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008487 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008488 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008489 }
8490 return 0;
8491}
8492/* lookup the character, put the result in the output string and adjust
8493 various state variables. Return a new reference to the object that
8494 was put in the output buffer in *result, or Py_None, if the mapping was
8495 undefined (in which case no character was written).
8496 The called must decref result.
8497 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008498static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008499charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8500 PyObject *mapping, Py_UCS4 **output,
8501 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008502 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008503{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008504 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8505 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008507 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008510 }
8511 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008513 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008515 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008516 }
8517 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008518 Py_ssize_t repsize;
8519 if (PyUnicode_READY(*res) == -1)
8520 return -1;
8521 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 if (repsize==1) {
8523 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008524 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008525 }
8526 else if (repsize!=0) {
8527 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008528 Py_ssize_t requiredsize = *opos +
8529 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 Py_ssize_t i;
8532 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008533 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008534 for(i = 0; i < repsize; i++)
8535 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008537 }
8538 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008539 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008540 return 0;
8541}
8542
Alexander Belopolsky40018472011-02-26 01:02:56 +00008543PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008544_PyUnicode_TranslateCharmap(PyObject *input,
8545 PyObject *mapping,
8546 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008548 /* input object */
8549 char *idata;
8550 Py_ssize_t size, i;
8551 int kind;
8552 /* output buffer */
8553 Py_UCS4 *output = NULL;
8554 Py_ssize_t osize;
8555 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008556 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008557 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008558 char *reason = "character maps to <undefined>";
8559 PyObject *errorHandler = NULL;
8560 PyObject *exc = NULL;
8561 /* the following variable is used for caching string comparisons
8562 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8563 * 3=ignore, 4=xmlcharrefreplace */
8564 int known_errorHandler = -1;
8565
Guido van Rossumd57fd912000-03-10 22:53:23 +00008566 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 PyErr_BadArgument();
8568 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008571 if (PyUnicode_READY(input) == -1)
8572 return NULL;
8573 idata = (char*)PyUnicode_DATA(input);
8574 kind = PyUnicode_KIND(input);
8575 size = PyUnicode_GET_LENGTH(input);
8576 i = 0;
8577
8578 if (size == 0) {
8579 Py_INCREF(input);
8580 return input;
8581 }
8582
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008583 /* allocate enough for a simple 1:1 translation without
8584 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008585 osize = size;
8586 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8587 opos = 0;
8588 if (output == NULL) {
8589 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008591 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008593 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 /* try to encode it */
8595 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008596 if (charmaptranslate_output(input, i, mapping,
8597 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 Py_XDECREF(x);
8599 goto onError;
8600 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008601 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008604 else { /* untranslatable character */
8605 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8606 Py_ssize_t repsize;
8607 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008608 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008610 Py_ssize_t collstart = i;
8611 Py_ssize_t collend = i+1;
8612 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008615 while (collend < size) {
8616 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008617 goto onError;
8618 Py_XDECREF(x);
8619 if (x!=Py_None)
8620 break;
8621 ++collend;
8622 }
8623 /* cache callback name lookup
8624 * (if not done yet, i.e. it's the first error) */
8625 if (known_errorHandler==-1) {
8626 if ((errors==NULL) || (!strcmp(errors, "strict")))
8627 known_errorHandler = 1;
8628 else if (!strcmp(errors, "replace"))
8629 known_errorHandler = 2;
8630 else if (!strcmp(errors, "ignore"))
8631 known_errorHandler = 3;
8632 else if (!strcmp(errors, "xmlcharrefreplace"))
8633 known_errorHandler = 4;
8634 else
8635 known_errorHandler = 0;
8636 }
8637 switch (known_errorHandler) {
8638 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008639 raise_translate_exception(&exc, input, collstart,
8640 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008641 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008642 case 2: /* replace */
8643 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644 for (coll = collstart; coll<collend; coll++)
8645 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 /* fall through */
8647 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 break;
8650 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008651 /* generate replacement (temporarily (mis)uses i) */
8652 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 char buffer[2+29+1+1];
8654 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008655 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8656 if (charmaptranslate_makespace(&output, &osize,
8657 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008658 goto onError;
8659 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008661 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 break;
8664 default:
8665 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666 reason, input, &exc,
8667 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008668 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008670 if (PyUnicode_READY(repunicode) < 0) {
8671 Py_DECREF(repunicode);
8672 goto onError;
8673 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008675 repsize = PyUnicode_GET_LENGTH(repunicode);
8676 if (charmaptranslate_makespace(&output, &osize,
8677 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 Py_DECREF(repunicode);
8679 goto onError;
8680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008681 for (uni2 = 0; repsize-->0; ++uni2)
8682 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8683 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008685 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008686 }
8687 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008688 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8689 if (!res)
8690 goto onError;
8691 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008692 Py_XDECREF(exc);
8693 Py_XDECREF(errorHandler);
8694 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008695
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008698 Py_XDECREF(exc);
8699 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700 return NULL;
8701}
8702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008703/* Deprecated. Use PyUnicode_Translate instead. */
8704PyObject *
8705PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8706 Py_ssize_t size,
8707 PyObject *mapping,
8708 const char *errors)
8709{
8710 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8711 if (!unicode)
8712 return NULL;
8713 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8714}
8715
Alexander Belopolsky40018472011-02-26 01:02:56 +00008716PyObject *
8717PyUnicode_Translate(PyObject *str,
8718 PyObject *mapping,
8719 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720{
8721 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008722
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723 str = PyUnicode_FromObject(str);
8724 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008726 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727 Py_DECREF(str);
8728 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008729
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731 Py_XDECREF(str);
8732 return NULL;
8733}
Tim Petersced69f82003-09-16 20:30:58 +00008734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008735static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008736fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008737{
8738 /* No need to call PyUnicode_READY(self) because this function is only
8739 called as a callback from fixup() which does it already. */
8740 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8741 const int kind = PyUnicode_KIND(self);
8742 void *data = PyUnicode_DATA(self);
8743 Py_UCS4 maxchar = 0, ch, fixed;
8744 Py_ssize_t i;
8745
8746 for (i = 0; i < len; ++i) {
8747 ch = PyUnicode_READ(kind, data, i);
8748 fixed = 0;
8749 if (ch > 127) {
8750 if (Py_UNICODE_ISSPACE(ch))
8751 fixed = ' ';
8752 else {
8753 const int decimal = Py_UNICODE_TODECIMAL(ch);
8754 if (decimal >= 0)
8755 fixed = '0' + decimal;
8756 }
8757 if (fixed != 0) {
8758 if (fixed > maxchar)
8759 maxchar = fixed;
8760 PyUnicode_WRITE(kind, data, i, fixed);
8761 }
8762 else if (ch > maxchar)
8763 maxchar = ch;
8764 }
8765 else if (ch > maxchar)
8766 maxchar = ch;
8767 }
8768
8769 return maxchar;
8770}
8771
8772PyObject *
8773_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8774{
8775 if (!PyUnicode_Check(unicode)) {
8776 PyErr_BadInternalCall();
8777 return NULL;
8778 }
8779 if (PyUnicode_READY(unicode) == -1)
8780 return NULL;
8781 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8782 /* If the string is already ASCII, just return the same string */
8783 Py_INCREF(unicode);
8784 return unicode;
8785 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008786 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787}
8788
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008789PyObject *
8790PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8791 Py_ssize_t length)
8792{
Victor Stinnerf0124502011-11-21 23:12:56 +01008793 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008794 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008795 Py_UCS4 maxchar;
8796 enum PyUnicode_Kind kind;
8797 void *data;
8798
8799 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008800 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008801 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008802 if (ch > 127) {
8803 int decimal = Py_UNICODE_TODECIMAL(ch);
8804 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008805 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008806 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008807 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008808 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008809
8810 /* Copy to a new string */
8811 decimal = PyUnicode_New(length, maxchar);
8812 if (decimal == NULL)
8813 return decimal;
8814 kind = PyUnicode_KIND(decimal);
8815 data = PyUnicode_DATA(decimal);
8816 /* Iterate over code points */
8817 for (i = 0; i < length; i++) {
8818 Py_UNICODE ch = s[i];
8819 if (ch > 127) {
8820 int decimal = Py_UNICODE_TODECIMAL(ch);
8821 if (decimal >= 0)
8822 ch = '0' + decimal;
8823 }
8824 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008825 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008826 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008827}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008828/* --- Decimal Encoder ---------------------------------------------------- */
8829
Alexander Belopolsky40018472011-02-26 01:02:56 +00008830int
8831PyUnicode_EncodeDecimal(Py_UNICODE *s,
8832 Py_ssize_t length,
8833 char *output,
8834 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008835{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008836 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008837 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008838 enum PyUnicode_Kind kind;
8839 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008840
8841 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008842 PyErr_BadArgument();
8843 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008844 }
8845
Victor Stinner42bf7752011-11-21 22:52:58 +01008846 unicode = PyUnicode_FromUnicode(s, length);
8847 if (unicode == NULL)
8848 return -1;
8849
Victor Stinner6345be92011-11-25 20:09:01 +01008850 if (PyUnicode_READY(unicode) < 0) {
8851 Py_DECREF(unicode);
8852 return -1;
8853 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008854 kind = PyUnicode_KIND(unicode);
8855 data = PyUnicode_DATA(unicode);
8856
Victor Stinnerb84d7232011-11-22 01:50:07 +01008857 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008858 PyObject *exc;
8859 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008860 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008861 Py_ssize_t startpos;
8862
8863 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008864
Benjamin Peterson29060642009-01-31 22:14:21 +00008865 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008866 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008867 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008868 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008869 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008870 decimal = Py_UNICODE_TODECIMAL(ch);
8871 if (decimal >= 0) {
8872 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008873 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008874 continue;
8875 }
8876 if (0 < ch && ch < 256) {
8877 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008878 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008879 continue;
8880 }
Victor Stinner6345be92011-11-25 20:09:01 +01008881
Victor Stinner42bf7752011-11-21 22:52:58 +01008882 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008883 exc = NULL;
8884 raise_encode_exception(&exc, "decimal", unicode,
8885 startpos, startpos+1,
8886 "invalid decimal Unicode string");
8887 Py_XDECREF(exc);
8888 Py_DECREF(unicode);
8889 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008890 }
8891 /* 0-terminate the output string */
8892 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008893 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008894 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008895}
8896
Guido van Rossumd57fd912000-03-10 22:53:23 +00008897/* --- Helpers ------------------------------------------------------------ */
8898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008899static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008900any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008901 Py_ssize_t start,
8902 Py_ssize_t end)
8903{
8904 int kind1, kind2, kind;
8905 void *buf1, *buf2;
8906 Py_ssize_t len1, len2, result;
8907
8908 kind1 = PyUnicode_KIND(s1);
8909 kind2 = PyUnicode_KIND(s2);
8910 kind = kind1 > kind2 ? kind1 : kind2;
8911 buf1 = PyUnicode_DATA(s1);
8912 buf2 = PyUnicode_DATA(s2);
8913 if (kind1 != kind)
8914 buf1 = _PyUnicode_AsKind(s1, kind);
8915 if (!buf1)
8916 return -2;
8917 if (kind2 != kind)
8918 buf2 = _PyUnicode_AsKind(s2, kind);
8919 if (!buf2) {
8920 if (kind1 != kind) PyMem_Free(buf1);
8921 return -2;
8922 }
8923 len1 = PyUnicode_GET_LENGTH(s1);
8924 len2 = PyUnicode_GET_LENGTH(s2);
8925
Victor Stinner794d5672011-10-10 03:21:36 +02008926 if (direction > 0) {
8927 switch(kind) {
8928 case PyUnicode_1BYTE_KIND:
8929 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8930 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8931 else
8932 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8933 break;
8934 case PyUnicode_2BYTE_KIND:
8935 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8936 break;
8937 case PyUnicode_4BYTE_KIND:
8938 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8939 break;
8940 default:
8941 assert(0); result = -2;
8942 }
8943 }
8944 else {
8945 switch(kind) {
8946 case PyUnicode_1BYTE_KIND:
8947 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8948 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8949 else
8950 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8951 break;
8952 case PyUnicode_2BYTE_KIND:
8953 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8954 break;
8955 case PyUnicode_4BYTE_KIND:
8956 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8957 break;
8958 default:
8959 assert(0); result = -2;
8960 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961 }
8962
8963 if (kind1 != kind)
8964 PyMem_Free(buf1);
8965 if (kind2 != kind)
8966 PyMem_Free(buf2);
8967
8968 return result;
8969}
8970
8971Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008972_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 Py_ssize_t n_buffer,
8974 void *digits, Py_ssize_t n_digits,
8975 Py_ssize_t min_width,
8976 const char *grouping,
8977 const char *thousands_sep)
8978{
8979 switch(kind) {
8980 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008981 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8982 return _PyUnicode_ascii_InsertThousandsGrouping(
8983 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8984 min_width, grouping, thousands_sep);
8985 else
8986 return _PyUnicode_ucs1_InsertThousandsGrouping(
8987 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8988 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008989 case PyUnicode_2BYTE_KIND:
8990 return _PyUnicode_ucs2_InsertThousandsGrouping(
8991 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8992 min_width, grouping, thousands_sep);
8993 case PyUnicode_4BYTE_KIND:
8994 return _PyUnicode_ucs4_InsertThousandsGrouping(
8995 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8996 min_width, grouping, thousands_sep);
8997 }
8998 assert(0);
8999 return -1;
9000}
9001
9002
/* Helper macro to fix up start/end slice values for a sequence of length
   len: end is clipped to len, and a negative start or end is taken
   relative to len and then clipped to 0.

   start and end must be modifiable lvalues; every argument is evaluated
   more than once.  The body is a single do { } while (0) statement so the
   macro (followed by a semicolon) can be used anywhere a statement can,
   including as the unbraced body of an if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009017
Alexander Belopolsky40018472011-02-26 01:02:56 +00009018Py_ssize_t
9019PyUnicode_Count(PyObject *str,
9020 PyObject *substr,
9021 Py_ssize_t start,
9022 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009024 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009025 PyObject* str_obj;
9026 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009027 int kind1, kind2, kind;
9028 void *buf1 = NULL, *buf2 = NULL;
9029 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009030
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009031 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009032 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009033 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009034 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009035 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009036 Py_DECREF(str_obj);
9037 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038 }
Tim Petersced69f82003-09-16 20:30:58 +00009039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009040 kind1 = PyUnicode_KIND(str_obj);
9041 kind2 = PyUnicode_KIND(sub_obj);
9042 kind = kind1 > kind2 ? kind1 : kind2;
9043 buf1 = PyUnicode_DATA(str_obj);
9044 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009045 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009046 if (!buf1)
9047 goto onError;
9048 buf2 = PyUnicode_DATA(sub_obj);
9049 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009050 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009051 if (!buf2)
9052 goto onError;
9053 len1 = PyUnicode_GET_LENGTH(str_obj);
9054 len2 = PyUnicode_GET_LENGTH(sub_obj);
9055
9056 ADJUST_INDICES(start, end, len1);
9057 switch(kind) {
9058 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009059 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9060 result = asciilib_count(
9061 ((Py_UCS1*)buf1) + start, end - start,
9062 buf2, len2, PY_SSIZE_T_MAX
9063 );
9064 else
9065 result = ucs1lib_count(
9066 ((Py_UCS1*)buf1) + start, end - start,
9067 buf2, len2, PY_SSIZE_T_MAX
9068 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069 break;
9070 case PyUnicode_2BYTE_KIND:
9071 result = ucs2lib_count(
9072 ((Py_UCS2*)buf1) + start, end - start,
9073 buf2, len2, PY_SSIZE_T_MAX
9074 );
9075 break;
9076 case PyUnicode_4BYTE_KIND:
9077 result = ucs4lib_count(
9078 ((Py_UCS4*)buf1) + start, end - start,
9079 buf2, len2, PY_SSIZE_T_MAX
9080 );
9081 break;
9082 default:
9083 assert(0); result = 0;
9084 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009085
9086 Py_DECREF(sub_obj);
9087 Py_DECREF(str_obj);
9088
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009089 if (kind1 != kind)
9090 PyMem_Free(buf1);
9091 if (kind2 != kind)
9092 PyMem_Free(buf2);
9093
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095 onError:
9096 Py_DECREF(sub_obj);
9097 Py_DECREF(str_obj);
9098 if (kind1 != kind && buf1)
9099 PyMem_Free(buf1);
9100 if (kind2 != kind && buf2)
9101 PyMem_Free(buf2);
9102 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103}
9104
Alexander Belopolsky40018472011-02-26 01:02:56 +00009105Py_ssize_t
9106PyUnicode_Find(PyObject *str,
9107 PyObject *sub,
9108 Py_ssize_t start,
9109 Py_ssize_t end,
9110 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009112 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009113
Guido van Rossumd57fd912000-03-10 22:53:23 +00009114 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009115 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009116 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009117 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009118 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009119 Py_DECREF(str);
9120 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121 }
Tim Petersced69f82003-09-16 20:30:58 +00009122
Victor Stinner794d5672011-10-10 03:21:36 +02009123 result = any_find_slice(direction,
9124 str, sub, start, end
9125 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009126
Guido van Rossumd57fd912000-03-10 22:53:23 +00009127 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009128 Py_DECREF(sub);
9129
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130 return result;
9131}
9132
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009133Py_ssize_t
9134PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9135 Py_ssize_t start, Py_ssize_t end,
9136 int direction)
9137{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009138 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009139 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009140 if (PyUnicode_READY(str) == -1)
9141 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009142 if (start < 0 || end < 0) {
9143 PyErr_SetString(PyExc_IndexError, "string index out of range");
9144 return -2;
9145 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009146 if (end > PyUnicode_GET_LENGTH(str))
9147 end = PyUnicode_GET_LENGTH(str);
9148 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009149 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9150 kind, end-start, ch, direction);
9151 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009152 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009153 else
9154 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009155}
9156
Alexander Belopolsky40018472011-02-26 01:02:56 +00009157static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009158tailmatch(PyObject *self,
9159 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009160 Py_ssize_t start,
9161 Py_ssize_t end,
9162 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009163{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009164 int kind_self;
9165 int kind_sub;
9166 void *data_self;
9167 void *data_sub;
9168 Py_ssize_t offset;
9169 Py_ssize_t i;
9170 Py_ssize_t end_sub;
9171
9172 if (PyUnicode_READY(self) == -1 ||
9173 PyUnicode_READY(substring) == -1)
9174 return 0;
9175
9176 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009177 return 1;
9178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9180 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009181 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009182 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009184 kind_self = PyUnicode_KIND(self);
9185 data_self = PyUnicode_DATA(self);
9186 kind_sub = PyUnicode_KIND(substring);
9187 data_sub = PyUnicode_DATA(substring);
9188 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9189
9190 if (direction > 0)
9191 offset = end;
9192 else
9193 offset = start;
9194
9195 if (PyUnicode_READ(kind_self, data_self, offset) ==
9196 PyUnicode_READ(kind_sub, data_sub, 0) &&
9197 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9198 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9199 /* If both are of the same kind, memcmp is sufficient */
9200 if (kind_self == kind_sub) {
9201 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009202 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009203 data_sub,
9204 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009205 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009206 }
9207 /* otherwise we have to compare each character by first accesing it */
9208 else {
9209 /* We do not need to compare 0 and len(substring)-1 because
9210 the if statement above ensured already that they are equal
9211 when we end up here. */
9212 // TODO: honor direction and do a forward or backwards search
9213 for (i = 1; i < end_sub; ++i) {
9214 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9215 PyUnicode_READ(kind_sub, data_sub, i))
9216 return 0;
9217 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009218 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009219 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009220 }
9221
9222 return 0;
9223}
9224
Alexander Belopolsky40018472011-02-26 01:02:56 +00009225Py_ssize_t
9226PyUnicode_Tailmatch(PyObject *str,
9227 PyObject *substr,
9228 Py_ssize_t start,
9229 Py_ssize_t end,
9230 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009231{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009232 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009233
Guido van Rossumd57fd912000-03-10 22:53:23 +00009234 str = PyUnicode_FromObject(str);
9235 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009236 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009237 substr = PyUnicode_FromObject(substr);
9238 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009239 Py_DECREF(str);
9240 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009241 }
Tim Petersced69f82003-09-16 20:30:58 +00009242
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009243 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009244 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009245 Py_DECREF(str);
9246 Py_DECREF(substr);
9247 return result;
9248}
9249
Guido van Rossumd57fd912000-03-10 22:53:23 +00009250/* Apply fixfct filter to the Unicode object self and return a
9251 reference to the modified object */
9252
Alexander Belopolsky40018472011-02-26 01:02:56 +00009253static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009254fixup(PyObject *self,
9255 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009256{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009257 PyObject *u;
9258 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009259
Victor Stinner87af4f22011-11-21 23:03:47 +01009260 u = PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009261 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009262 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009263 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265 /* fix functions return the new maximum character in a string,
9266 if the kind of the resulting unicode object does not change,
9267 everything is fine. Otherwise we need to change the string kind
9268 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009269 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009270 if (maxchar_new == 0)
9271 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9272 else if (maxchar_new <= 127)
9273 maxchar_new = 127;
9274 else if (maxchar_new <= 255)
9275 maxchar_new = 255;
9276 else if (maxchar_new <= 65535)
9277 maxchar_new = 65535;
9278 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009279 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280
9281 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009282 /* fixfct should return TRUE if it modified the buffer. If
9283 FALSE, return a reference to the original buffer instead
9284 (to save space, not time) */
9285 Py_INCREF(self);
9286 Py_DECREF(u);
Victor Stinner7931d9a2011-11-04 00:22:48 +01009287 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009288 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289 else if (maxchar_new == maxchar_old) {
9290 return u;
9291 }
9292 else {
9293 /* In case the maximum character changed, we need to
9294 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009295 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009296 if (v == NULL) {
9297 Py_DECREF(u);
9298 return NULL;
9299 }
9300 if (maxchar_new > maxchar_old) {
9301 /* If the maxchar increased so that the kind changed, not all
9302 characters are representable anymore and we need to fix the
9303 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009304 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009305 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9307 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009308 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009309 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009311
9312 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009313 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009314 return v;
9315 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009316}
9317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009319fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009320{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009321 /* No need to call PyUnicode_READY(self) because this function is only
9322 called as a callback from fixup() which does it already. */
9323 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9324 const int kind = PyUnicode_KIND(self);
9325 void *data = PyUnicode_DATA(self);
9326 int touched = 0;
9327 Py_UCS4 maxchar = 0;
9328 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009330 for (i = 0; i < len; ++i) {
9331 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9332 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9333 if (up != ch) {
9334 if (up > maxchar)
9335 maxchar = up;
9336 PyUnicode_WRITE(kind, data, i, up);
9337 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009339 else if (ch > maxchar)
9340 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009341 }
9342
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009343 if (touched)
9344 return maxchar;
9345 else
9346 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009347}
9348
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009349static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009350fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009351{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009352 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9353 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9354 const int kind = PyUnicode_KIND(self);
9355 void *data = PyUnicode_DATA(self);
9356 int touched = 0;
9357 Py_UCS4 maxchar = 0;
9358 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 for(i = 0; i < len; ++i) {
9361 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9362 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9363 if (lo != ch) {
9364 if (lo > maxchar)
9365 maxchar = lo;
9366 PyUnicode_WRITE(kind, data, i, lo);
9367 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009368 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009369 else if (ch > maxchar)
9370 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009371 }
9372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009373 if (touched)
9374 return maxchar;
9375 else
9376 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009377}
9378
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009380fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009381{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009382 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9383 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9384 const int kind = PyUnicode_KIND(self);
9385 void *data = PyUnicode_DATA(self);
9386 int touched = 0;
9387 Py_UCS4 maxchar = 0;
9388 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009389
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009390 for(i = 0; i < len; ++i) {
9391 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9392 Py_UCS4 nu = 0;
9393
9394 if (Py_UNICODE_ISUPPER(ch))
9395 nu = Py_UNICODE_TOLOWER(ch);
9396 else if (Py_UNICODE_ISLOWER(ch))
9397 nu = Py_UNICODE_TOUPPER(ch);
9398
9399 if (nu != 0) {
9400 if (nu > maxchar)
9401 maxchar = nu;
9402 PyUnicode_WRITE(kind, data, i, nu);
9403 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009404 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009405 else if (ch > maxchar)
9406 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009407 }
9408
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009409 if (touched)
9410 return maxchar;
9411 else
9412 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009413}
9414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009415static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009416fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009417{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9419 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9420 const int kind = PyUnicode_KIND(self);
9421 void *data = PyUnicode_DATA(self);
9422 int touched = 0;
9423 Py_UCS4 maxchar = 0;
9424 Py_ssize_t i = 0;
9425 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009426
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009427 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009428 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009429
9430 ch = PyUnicode_READ(kind, data, i);
9431 if (!Py_UNICODE_ISUPPER(ch)) {
9432 maxchar = Py_UNICODE_TOUPPER(ch);
9433 PyUnicode_WRITE(kind, data, i, maxchar);
9434 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 ++i;
9437 for(; i < len; ++i) {
9438 ch = PyUnicode_READ(kind, data, i);
9439 if (!Py_UNICODE_ISLOWER(ch)) {
9440 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9441 if (lo > maxchar)
9442 maxchar = lo;
9443 PyUnicode_WRITE(kind, data, i, lo);
9444 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009445 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009446 else if (ch > maxchar)
9447 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009448 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449
9450 if (touched)
9451 return maxchar;
9452 else
9453 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009454}
9455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009456static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009457fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009458{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9460 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9461 const int kind = PyUnicode_KIND(self);
9462 void *data = PyUnicode_DATA(self);
9463 Py_UCS4 maxchar = 0;
9464 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009465 int previous_is_cased;
9466
9467 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 if (len == 1) {
9469 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9470 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9471 if (ti != ch) {
9472 PyUnicode_WRITE(kind, data, i, ti);
9473 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009474 }
9475 else
9476 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009477 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479 for(; i < len; ++i) {
9480 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9481 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009482
Benjamin Peterson29060642009-01-31 22:14:21 +00009483 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009485 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486 nu = Py_UNICODE_TOTITLE(ch);
9487
9488 if (nu > maxchar)
9489 maxchar = nu;
9490 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009491
Benjamin Peterson29060642009-01-31 22:14:21 +00009492 if (Py_UNICODE_ISLOWER(ch) ||
9493 Py_UNICODE_ISUPPER(ch) ||
9494 Py_UNICODE_ISTITLE(ch))
9495 previous_is_cased = 1;
9496 else
9497 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009498 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009499 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500}
9501
Tim Peters8ce9f162004-08-27 01:49:32 +00009502PyObject *
9503PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009505 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009506 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009508 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009509 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9510 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009511 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009513 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009515 int use_memcpy;
9516 unsigned char *res_data = NULL, *sep_data = NULL;
9517 PyObject *last_obj;
9518 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519
Tim Peters05eba1f2004-08-27 21:32:02 +00009520 fseq = PySequence_Fast(seq, "");
9521 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009522 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009523 }
9524
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009525 /* NOTE: the following code can't call back into Python code,
9526 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009527 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009528
Tim Peters05eba1f2004-08-27 21:32:02 +00009529 seqlen = PySequence_Fast_GET_SIZE(fseq);
9530 /* If empty sequence, return u"". */
9531 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009532 Py_DECREF(fseq);
9533 Py_INCREF(unicode_empty);
9534 res = unicode_empty;
9535 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009536 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009537
Tim Peters05eba1f2004-08-27 21:32:02 +00009538 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009539 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009540 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009541 if (seqlen == 1) {
9542 if (PyUnicode_CheckExact(items[0])) {
9543 res = items[0];
9544 Py_INCREF(res);
9545 Py_DECREF(fseq);
9546 return res;
9547 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009548 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009549 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009550 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009551 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009552 /* Set up sep and seplen */
9553 if (separator == NULL) {
9554 /* fall back to a blank space separator */
9555 sep = PyUnicode_FromOrdinal(' ');
9556 if (!sep)
9557 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009558 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009559 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009560 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009561 else {
9562 if (!PyUnicode_Check(separator)) {
9563 PyErr_Format(PyExc_TypeError,
9564 "separator: expected str instance,"
9565 " %.80s found",
9566 Py_TYPE(separator)->tp_name);
9567 goto onError;
9568 }
9569 if (PyUnicode_READY(separator))
9570 goto onError;
9571 sep = separator;
9572 seplen = PyUnicode_GET_LENGTH(separator);
9573 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9574 /* inc refcount to keep this code path symmetric with the
9575 above case of a blank separator */
9576 Py_INCREF(sep);
9577 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009578 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009579 }
9580
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009581 /* There are at least two things to join, or else we have a subclass
9582 * of str in the sequence.
9583 * Do a pre-pass to figure out the total amount of space we'll
9584 * need (sz), and see whether all argument are strings.
9585 */
9586 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009587#ifdef Py_DEBUG
9588 use_memcpy = 0;
9589#else
9590 use_memcpy = 1;
9591#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009592 for (i = 0; i < seqlen; i++) {
9593 const Py_ssize_t old_sz = sz;
9594 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009595 if (!PyUnicode_Check(item)) {
9596 PyErr_Format(PyExc_TypeError,
9597 "sequence item %zd: expected str instance,"
9598 " %.80s found",
9599 i, Py_TYPE(item)->tp_name);
9600 goto onError;
9601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009602 if (PyUnicode_READY(item) == -1)
9603 goto onError;
9604 sz += PyUnicode_GET_LENGTH(item);
9605 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009606 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009607 if (i != 0)
9608 sz += seplen;
9609 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9610 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009611 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009612 goto onError;
9613 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009614 if (use_memcpy && last_obj != NULL) {
9615 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9616 use_memcpy = 0;
9617 }
9618 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009619 }
Tim Petersced69f82003-09-16 20:30:58 +00009620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009621 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009622 if (res == NULL)
9623 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009624
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009625 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009626#ifdef Py_DEBUG
9627 use_memcpy = 0;
9628#else
9629 if (use_memcpy) {
9630 res_data = PyUnicode_1BYTE_DATA(res);
9631 kind = PyUnicode_KIND(res);
9632 if (seplen != 0)
9633 sep_data = PyUnicode_1BYTE_DATA(sep);
9634 }
9635#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009636 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009637 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009638 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009639 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009640 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009641 if (use_memcpy) {
9642 Py_MEMCPY(res_data,
9643 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009644 kind * seplen);
9645 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009646 }
9647 else {
9648 copy_characters(res, res_offset, sep, 0, seplen);
9649 res_offset += seplen;
9650 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009651 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009652 itemlen = PyUnicode_GET_LENGTH(item);
9653 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009654 if (use_memcpy) {
9655 Py_MEMCPY(res_data,
9656 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009657 kind * itemlen);
9658 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009659 }
9660 else {
9661 copy_characters(res, res_offset, item, 0, itemlen);
9662 res_offset += itemlen;
9663 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009664 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009665 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009666 if (use_memcpy)
9667 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009668 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009669 else
9670 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009671
Tim Peters05eba1f2004-08-27 21:32:02 +00009672 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009673 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009674 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009675 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009676
Benjamin Peterson29060642009-01-31 22:14:21 +00009677 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009678 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009679 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009680 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009681 return NULL;
9682}
9683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009684#define FILL(kind, data, value, start, length) \
9685 do { \
9686 Py_ssize_t i_ = 0; \
9687 assert(kind != PyUnicode_WCHAR_KIND); \
9688 switch ((kind)) { \
9689 case PyUnicode_1BYTE_KIND: { \
9690 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9691 memset(to_, (unsigned char)value, length); \
9692 break; \
9693 } \
9694 case PyUnicode_2BYTE_KIND: { \
9695 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9696 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9697 break; \
9698 } \
9699 default: { \
9700 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9701 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9702 break; \
9703 } \
9704 } \
9705 } while (0)
9706
Victor Stinner9310abb2011-10-05 00:59:23 +02009707static PyObject *
9708pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009709 Py_ssize_t left,
9710 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009712{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713 PyObject *u;
9714 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009715 int kind;
9716 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009717
9718 if (left < 0)
9719 left = 0;
9720 if (right < 0)
9721 right = 0;
9722
Tim Peters7a29bd52001-09-12 03:03:31 +00009723 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009724 Py_INCREF(self);
9725 return self;
9726 }
9727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009728 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9729 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009730 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9731 return NULL;
9732 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009733 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9734 if (fill > maxchar)
9735 maxchar = fill;
9736 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009737 if (!u)
9738 return NULL;
9739
9740 kind = PyUnicode_KIND(u);
9741 data = PyUnicode_DATA(u);
9742 if (left)
9743 FILL(kind, data, fill, 0, left);
9744 if (right)
9745 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009746 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009747 assert(_PyUnicode_CheckConsistency(u, 1));
9748 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009749}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009751
Alexander Belopolsky40018472011-02-26 01:02:56 +00009752PyObject *
9753PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009754{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009755 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009756
9757 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009758 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009759 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009760
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009761 switch(PyUnicode_KIND(string)) {
9762 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009763 if (PyUnicode_IS_ASCII(string))
9764 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009765 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009766 PyUnicode_GET_LENGTH(string), keepends);
9767 else
9768 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009769 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009770 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009771 break;
9772 case PyUnicode_2BYTE_KIND:
9773 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009774 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009775 PyUnicode_GET_LENGTH(string), keepends);
9776 break;
9777 case PyUnicode_4BYTE_KIND:
9778 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009779 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 PyUnicode_GET_LENGTH(string), keepends);
9781 break;
9782 default:
9783 assert(0);
9784 list = 0;
9785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009786 Py_DECREF(string);
9787 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009788}
9789
Alexander Belopolsky40018472011-02-26 01:02:56 +00009790static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009791split(PyObject *self,
9792 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009793 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009794{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009795 int kind1, kind2, kind;
9796 void *buf1, *buf2;
9797 Py_ssize_t len1, len2;
9798 PyObject* out;
9799
Guido van Rossumd57fd912000-03-10 22:53:23 +00009800 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009801 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009803 if (PyUnicode_READY(self) == -1)
9804 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009806 if (substring == NULL)
9807 switch(PyUnicode_KIND(self)) {
9808 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009809 if (PyUnicode_IS_ASCII(self))
9810 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009811 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009812 PyUnicode_GET_LENGTH(self), maxcount
9813 );
9814 else
9815 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009816 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009817 PyUnicode_GET_LENGTH(self), maxcount
9818 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009819 case PyUnicode_2BYTE_KIND:
9820 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009821 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 PyUnicode_GET_LENGTH(self), maxcount
9823 );
9824 case PyUnicode_4BYTE_KIND:
9825 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009826 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009827 PyUnicode_GET_LENGTH(self), maxcount
9828 );
9829 default:
9830 assert(0);
9831 return NULL;
9832 }
9833
9834 if (PyUnicode_READY(substring) == -1)
9835 return NULL;
9836
9837 kind1 = PyUnicode_KIND(self);
9838 kind2 = PyUnicode_KIND(substring);
9839 kind = kind1 > kind2 ? kind1 : kind2;
9840 buf1 = PyUnicode_DATA(self);
9841 buf2 = PyUnicode_DATA(substring);
9842 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009843 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009844 if (!buf1)
9845 return NULL;
9846 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009847 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009848 if (!buf2) {
9849 if (kind1 != kind) PyMem_Free(buf1);
9850 return NULL;
9851 }
9852 len1 = PyUnicode_GET_LENGTH(self);
9853 len2 = PyUnicode_GET_LENGTH(substring);
9854
9855 switch(kind) {
9856 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009857 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9858 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009859 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009860 else
9861 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009862 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009863 break;
9864 case PyUnicode_2BYTE_KIND:
9865 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009866 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 break;
9868 case PyUnicode_4BYTE_KIND:
9869 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009870 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009871 break;
9872 default:
9873 out = NULL;
9874 }
9875 if (kind1 != kind)
9876 PyMem_Free(buf1);
9877 if (kind2 != kind)
9878 PyMem_Free(buf2);
9879 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009880}
9881
Alexander Belopolsky40018472011-02-26 01:02:56 +00009882static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009883rsplit(PyObject *self,
9884 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009885 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009886{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009887 int kind1, kind2, kind;
9888 void *buf1, *buf2;
9889 Py_ssize_t len1, len2;
9890 PyObject* out;
9891
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009892 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009893 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009895 if (PyUnicode_READY(self) == -1)
9896 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009898 if (substring == NULL)
9899 switch(PyUnicode_KIND(self)) {
9900 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009901 if (PyUnicode_IS_ASCII(self))
9902 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009903 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009904 PyUnicode_GET_LENGTH(self), maxcount
9905 );
9906 else
9907 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009908 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009909 PyUnicode_GET_LENGTH(self), maxcount
9910 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009911 case PyUnicode_2BYTE_KIND:
9912 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009913 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009914 PyUnicode_GET_LENGTH(self), maxcount
9915 );
9916 case PyUnicode_4BYTE_KIND:
9917 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009918 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009919 PyUnicode_GET_LENGTH(self), maxcount
9920 );
9921 default:
9922 assert(0);
9923 return NULL;
9924 }
9925
9926 if (PyUnicode_READY(substring) == -1)
9927 return NULL;
9928
9929 kind1 = PyUnicode_KIND(self);
9930 kind2 = PyUnicode_KIND(substring);
9931 kind = kind1 > kind2 ? kind1 : kind2;
9932 buf1 = PyUnicode_DATA(self);
9933 buf2 = PyUnicode_DATA(substring);
9934 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009935 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009936 if (!buf1)
9937 return NULL;
9938 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009939 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009940 if (!buf2) {
9941 if (kind1 != kind) PyMem_Free(buf1);
9942 return NULL;
9943 }
9944 len1 = PyUnicode_GET_LENGTH(self);
9945 len2 = PyUnicode_GET_LENGTH(substring);
9946
9947 switch(kind) {
9948 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009949 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9950 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009951 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009952 else
9953 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009954 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009955 break;
9956 case PyUnicode_2BYTE_KIND:
9957 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009958 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 break;
9960 case PyUnicode_4BYTE_KIND:
9961 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009962 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009963 break;
9964 default:
9965 out = NULL;
9966 }
9967 if (kind1 != kind)
9968 PyMem_Free(buf1);
9969 if (kind2 != kind)
9970 PyMem_Free(buf2);
9971 return out;
9972}
9973
9974static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009975anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9976 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009977{
9978 switch(kind) {
9979 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009980 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9981 return asciilib_find(buf1, len1, buf2, len2, offset);
9982 else
9983 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009984 case PyUnicode_2BYTE_KIND:
9985 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9986 case PyUnicode_4BYTE_KIND:
9987 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9988 }
9989 assert(0);
9990 return -1;
9991}
9992
9993static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009994anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9995 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009996{
9997 switch(kind) {
9998 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009999 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10000 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10001 else
10002 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003 case PyUnicode_2BYTE_KIND:
10004 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10005 case PyUnicode_4BYTE_KIND:
10006 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10007 }
10008 assert(0);
10009 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010010}
10011
Alexander Belopolsky40018472011-02-26 01:02:56 +000010012static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013replace(PyObject *self, PyObject *str1,
10014 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010015{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010016 PyObject *u;
10017 char *sbuf = PyUnicode_DATA(self);
10018 char *buf1 = PyUnicode_DATA(str1);
10019 char *buf2 = PyUnicode_DATA(str2);
10020 int srelease = 0, release1 = 0, release2 = 0;
10021 int skind = PyUnicode_KIND(self);
10022 int kind1 = PyUnicode_KIND(str1);
10023 int kind2 = PyUnicode_KIND(str2);
10024 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10025 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10026 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010027 int mayshrink;
10028 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010029
10030 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010031 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010032 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010033 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010034
Victor Stinner59de0ee2011-10-07 10:01:28 +020010035 if (str1 == str2)
10036 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 if (skind < kind1)
10038 /* substring too wide to be present */
10039 goto nothing;
10040
Victor Stinner49a0a212011-10-12 23:46:10 +020010041 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10042 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10043 /* Replacing str1 with str2 may cause a maxchar reduction in the
10044 result string. */
10045 mayshrink = (maxchar_str2 < maxchar);
10046 maxchar = Py_MAX(maxchar, maxchar_str2);
10047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010048 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010049 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010050 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010051 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010052 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010054 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010055 Py_UCS4 u1, u2;
10056 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010058 if (findchar(sbuf, PyUnicode_KIND(self),
10059 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010060 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010062 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010063 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010065 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010066 rkind = PyUnicode_KIND(u);
10067 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10068 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010069 if (--maxcount < 0)
10070 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010071 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010072 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010073 }
10074 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 int rkind = skind;
10076 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010077
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010078 if (kind1 < rkind) {
10079 /* widen substring */
10080 buf1 = _PyUnicode_AsKind(str1, rkind);
10081 if (!buf1) goto error;
10082 release1 = 1;
10083 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010084 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010085 if (i < 0)
10086 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010087 if (rkind > kind2) {
10088 /* widen replacement */
10089 buf2 = _PyUnicode_AsKind(str2, rkind);
10090 if (!buf2) goto error;
10091 release2 = 1;
10092 }
10093 else if (rkind < kind2) {
10094 /* widen self and buf1 */
10095 rkind = kind2;
10096 if (release1) PyMem_Free(buf1);
10097 sbuf = _PyUnicode_AsKind(self, rkind);
10098 if (!sbuf) goto error;
10099 srelease = 1;
10100 buf1 = _PyUnicode_AsKind(str1, rkind);
10101 if (!buf1) goto error;
10102 release1 = 1;
10103 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010104 u = PyUnicode_New(slen, maxchar);
10105 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010107 assert(PyUnicode_KIND(u) == rkind);
10108 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010109
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010110 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010111 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010112 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010114 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010116
10117 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010118 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010119 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010120 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010121 if (i == -1)
10122 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010123 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010124 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010125 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010126 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010127 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010128 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010129 }
10130 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 Py_ssize_t n, i, j, ires;
10132 Py_ssize_t product, new_size;
10133 int rkind = skind;
10134 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010137 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 buf1 = _PyUnicode_AsKind(str1, rkind);
10139 if (!buf1) goto error;
10140 release1 = 1;
10141 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010142 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010143 if (n == 0)
10144 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010146 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 buf2 = _PyUnicode_AsKind(str2, rkind);
10148 if (!buf2) goto error;
10149 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010152 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 rkind = kind2;
10154 sbuf = _PyUnicode_AsKind(self, rkind);
10155 if (!sbuf) goto error;
10156 srelease = 1;
10157 if (release1) PyMem_Free(buf1);
10158 buf1 = _PyUnicode_AsKind(str1, rkind);
10159 if (!buf1) goto error;
10160 release1 = 1;
10161 }
10162 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10163 PyUnicode_GET_LENGTH(str1))); */
10164 product = n * (len2-len1);
10165 if ((product / (len2-len1)) != n) {
10166 PyErr_SetString(PyExc_OverflowError,
10167 "replace string is too long");
10168 goto error;
10169 }
10170 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010171 if (new_size == 0) {
10172 Py_INCREF(unicode_empty);
10173 u = unicode_empty;
10174 goto done;
10175 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10177 PyErr_SetString(PyExc_OverflowError,
10178 "replace string is too long");
10179 goto error;
10180 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010181 u = PyUnicode_New(new_size, maxchar);
10182 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010184 assert(PyUnicode_KIND(u) == rkind);
10185 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 ires = i = 0;
10187 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010188 while (n-- > 0) {
10189 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010190 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010191 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010192 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010193 if (j == -1)
10194 break;
10195 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010196 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010197 memcpy(res + rkind * ires,
10198 sbuf + rkind * i,
10199 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010201 }
10202 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010204 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010205 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010206 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010208 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010210 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010211 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010212 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010213 memcpy(res + rkind * ires,
10214 sbuf + rkind * i,
10215 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010216 }
10217 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010218 /* interleave */
10219 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010220 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010222 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010224 if (--n <= 0)
10225 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010226 memcpy(res + rkind * ires,
10227 sbuf + rkind * i,
10228 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 ires++;
10230 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010231 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010232 memcpy(res + rkind * ires,
10233 sbuf + rkind * i,
10234 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010235 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010236 }
10237
10238 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010239 unicode_adjust_maxchar(&u);
10240 if (u == NULL)
10241 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010243
10244 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 if (srelease)
10246 PyMem_FREE(sbuf);
10247 if (release1)
10248 PyMem_FREE(buf1);
10249 if (release2)
10250 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010251 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010253
Benjamin Peterson29060642009-01-31 22:14:21 +000010254 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010255 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 if (srelease)
10257 PyMem_FREE(sbuf);
10258 if (release1)
10259 PyMem_FREE(buf1);
10260 if (release2)
10261 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010262 if (PyUnicode_CheckExact(self)) {
10263 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010264 return self;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010265 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010266 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 error:
10268 if (srelease && sbuf)
10269 PyMem_FREE(sbuf);
10270 if (release1 && buf1)
10271 PyMem_FREE(buf1);
10272 if (release2 && buf2)
10273 PyMem_FREE(buf2);
10274 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010275}
10276
10277/* --- Unicode Object Methods --------------------------------------------- */
10278
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010279PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010280 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010281\n\
10282Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010283characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010284
10285static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010286unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010287{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010288 return fixup(self, fixtitle);
10289}
10290
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010291PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010292 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010293\n\
10294Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010295have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010296
10297static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010298unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010299{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010300 return fixup(self, fixcapitalize);
10301}
10302
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of str.capwords(): split self on whitespace,
   capitalize every word in place inside the list, then join the list
   again.  Returns the joined string, or NULL if any step fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new string into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup(word, fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10340
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010341/* Argument converter. Coerces to a single unicode character */
10342
10343static int
10344convert_uc(PyObject *obj, void *addr)
10345{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010347 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010348
Benjamin Peterson14339b62009-01-31 16:36:08 +000010349 uniobj = PyUnicode_FromObject(obj);
10350 if (uniobj == NULL) {
10351 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010352 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010353 return 0;
10354 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010356 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010357 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010358 Py_DECREF(uniobj);
10359 return 0;
10360 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010362 Py_DECREF(uniobj);
10363 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010364}
10365
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010366PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010367 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010368\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010369Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010370done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371
10372static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010373unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010375 Py_ssize_t marg, left;
10376 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 Py_UCS4 fillchar = ' ';
10378
Victor Stinnere9a29352011-10-01 02:14:59 +020010379 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381
Victor Stinnere9a29352011-10-01 02:14:59 +020010382 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010383 return NULL;
10384
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010387 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388 }
10389
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391 left = marg / 2 + (marg & width & 1);
10392
Victor Stinner9310abb2011-10-05 00:59:23 +020010393 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394}
10395
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396/* This function assumes that str1 and str2 are readied by the caller. */
10397
Marc-André Lemburge5034372000-08-08 08:04:29 +000010398static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010399unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010400{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 int kind1, kind2;
10402 void *data1, *data2;
10403 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 kind1 = PyUnicode_KIND(str1);
10406 kind2 = PyUnicode_KIND(str2);
10407 data1 = PyUnicode_DATA(str1);
10408 data2 = PyUnicode_DATA(str2);
10409 len1 = PyUnicode_GET_LENGTH(str1);
10410 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010411
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 for (i = 0; i < len1 && i < len2; ++i) {
10413 Py_UCS4 c1, c2;
10414 c1 = PyUnicode_READ(kind1, data1, i);
10415 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010416
10417 if (c1 != c2)
10418 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010419 }
10420
10421 return (len1 < len2) ? -1 : (len1 != len2);
10422}
10423
Alexander Belopolsky40018472011-02-26 01:02:56 +000010424int
10425PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010426{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10428 if (PyUnicode_READY(left) == -1 ||
10429 PyUnicode_READY(right) == -1)
10430 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010431 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010433 PyErr_Format(PyExc_TypeError,
10434 "Can't compare %.100s and %.100s",
10435 left->ob_type->tp_name,
10436 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010437 return -1;
10438}
10439
Martin v. Löwis5b222132007-06-10 09:51:05 +000010440int
10441PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10442{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 Py_ssize_t i;
10444 int kind;
10445 void *data;
10446 Py_UCS4 chr;
10447
Victor Stinner910337b2011-10-03 03:20:16 +020010448 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 if (PyUnicode_READY(uni) == -1)
10450 return -1;
10451 kind = PyUnicode_KIND(uni);
10452 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010453 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10455 if (chr != str[i])
10456 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010457 /* This check keeps Python strings that end in '\0' from comparing equal
10458 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010460 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010461 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010462 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010463 return 0;
10464}
10465
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010466
Benjamin Peterson29060642009-01-31 22:14:21 +000010467#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010468 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010469
Alexander Belopolsky40018472011-02-26 01:02:56 +000010470PyObject *
10471PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010472{
10473 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010474
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010475 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10476 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010477 if (PyUnicode_READY(left) == -1 ||
10478 PyUnicode_READY(right) == -1)
10479 return NULL;
10480 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10481 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010482 if (op == Py_EQ) {
10483 Py_INCREF(Py_False);
10484 return Py_False;
10485 }
10486 if (op == Py_NE) {
10487 Py_INCREF(Py_True);
10488 return Py_True;
10489 }
10490 }
10491 if (left == right)
10492 result = 0;
10493 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010494 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010495
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010496 /* Convert the return value to a Boolean */
10497 switch (op) {
10498 case Py_EQ:
10499 v = TEST_COND(result == 0);
10500 break;
10501 case Py_NE:
10502 v = TEST_COND(result != 0);
10503 break;
10504 case Py_LE:
10505 v = TEST_COND(result <= 0);
10506 break;
10507 case Py_GE:
10508 v = TEST_COND(result >= 0);
10509 break;
10510 case Py_LT:
10511 v = TEST_COND(result == -1);
10512 break;
10513 case Py_GT:
10514 v = TEST_COND(result == 1);
10515 break;
10516 default:
10517 PyErr_BadArgument();
10518 return NULL;
10519 }
10520 Py_INCREF(v);
10521 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010522 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010523
Brian Curtindfc80e32011-08-10 20:28:54 -050010524 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010525}
10526
Alexander Belopolsky40018472011-02-26 01:02:56 +000010527int
10528PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010529{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010530 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 int kind1, kind2, kind;
10532 void *buf1, *buf2;
10533 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010534 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010535
10536 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010537 sub = PyUnicode_FromObject(element);
10538 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010539 PyErr_Format(PyExc_TypeError,
10540 "'in <string>' requires string as left operand, not %s",
10541 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010542 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010543 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 if (PyUnicode_READY(sub) == -1)
10545 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010546
Thomas Wouters477c8d52006-05-27 19:21:47 +000010547 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010548 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010549 Py_DECREF(sub);
10550 return -1;
10551 }
10552
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 kind1 = PyUnicode_KIND(str);
10554 kind2 = PyUnicode_KIND(sub);
10555 kind = kind1 > kind2 ? kind1 : kind2;
10556 buf1 = PyUnicode_DATA(str);
10557 buf2 = PyUnicode_DATA(sub);
10558 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010559 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 if (!buf1) {
10561 Py_DECREF(sub);
10562 return -1;
10563 }
10564 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010565 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 if (!buf2) {
10567 Py_DECREF(sub);
10568 if (kind1 != kind) PyMem_Free(buf1);
10569 return -1;
10570 }
10571 len1 = PyUnicode_GET_LENGTH(str);
10572 len2 = PyUnicode_GET_LENGTH(sub);
10573
10574 switch(kind) {
10575 case PyUnicode_1BYTE_KIND:
10576 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10577 break;
10578 case PyUnicode_2BYTE_KIND:
10579 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10580 break;
10581 case PyUnicode_4BYTE_KIND:
10582 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10583 break;
10584 default:
10585 result = -1;
10586 assert(0);
10587 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010588
10589 Py_DECREF(str);
10590 Py_DECREF(sub);
10591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 if (kind1 != kind)
10593 PyMem_Free(buf1);
10594 if (kind2 != kind)
10595 PyMem_Free(buf2);
10596
Guido van Rossum403d68b2000-03-13 15:55:09 +000010597 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010598}
10599
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600/* Concat to string or Unicode object giving a new Unicode object. */
10601
Alexander Belopolsky40018472011-02-26 01:02:56 +000010602PyObject *
10603PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010606 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607
10608 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010610 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010611 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010614 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615
10616 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010617 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010618 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010620 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010621 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010622 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010624 }
10625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010627 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10628 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629
Guido van Rossumd57fd912000-03-10 22:53:23 +000010630 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 w = PyUnicode_New(
10632 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10633 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010634 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010635 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010636 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10637 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010638 Py_DECREF(u);
10639 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010640 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642
Benjamin Peterson29060642009-01-31 22:14:21 +000010643 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010644 Py_XDECREF(u);
10645 Py_XDECREF(v);
10646 return NULL;
10647}
10648
Victor Stinnerb0923652011-10-04 01:17:31 +020010649static void
10650unicode_append_inplace(PyObject **p_left, PyObject *right)
10651{
10652 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010653
10654 assert(PyUnicode_IS_READY(*p_left));
10655 assert(PyUnicode_IS_READY(right));
10656
10657 left_len = PyUnicode_GET_LENGTH(*p_left);
10658 right_len = PyUnicode_GET_LENGTH(right);
10659 if (left_len > PY_SSIZE_T_MAX - right_len) {
10660 PyErr_SetString(PyExc_OverflowError,
10661 "strings are too large to concat");
10662 goto error;
10663 }
10664 new_len = left_len + right_len;
10665
10666 /* Now we own the last reference to 'left', so we can resize it
10667 * in-place.
10668 */
10669 if (unicode_resize(p_left, new_len) != 0) {
10670 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10671 * deallocated so it cannot be put back into
10672 * 'variable'. The MemoryError is raised when there
10673 * is no value in 'variable', which might (very
10674 * remotely) be a cause of incompatibilities.
10675 */
10676 goto error;
10677 }
10678 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010679 copy_characters(*p_left, left_len, right, 0, right_len);
10680 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010681 return;
10682
10683error:
10684 Py_DECREF(*p_left);
10685 *p_left = NULL;
10686}
10687
Walter Dörwald1ab83302007-05-18 17:15:44 +000010688void
Victor Stinner23e56682011-10-03 03:54:37 +020010689PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010690{
Victor Stinner23e56682011-10-03 03:54:37 +020010691 PyObject *left, *res;
10692
10693 if (p_left == NULL) {
10694 if (!PyErr_Occurred())
10695 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010696 return;
10697 }
Victor Stinner23e56682011-10-03 03:54:37 +020010698 left = *p_left;
10699 if (right == NULL || !PyUnicode_Check(left)) {
10700 if (!PyErr_Occurred())
10701 PyErr_BadInternalCall();
10702 goto error;
10703 }
10704
Victor Stinnere1335c72011-10-04 20:53:03 +020010705 if (PyUnicode_READY(left))
10706 goto error;
10707 if (PyUnicode_READY(right))
10708 goto error;
10709
Victor Stinner23e56682011-10-03 03:54:37 +020010710 if (PyUnicode_CheckExact(left) && left != unicode_empty
10711 && PyUnicode_CheckExact(right) && right != unicode_empty
10712 && unicode_resizable(left)
10713 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10714 || _PyUnicode_WSTR(left) != NULL))
10715 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010716 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10717 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010718 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010719 not so different than duplicating the string. */
10720 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010721 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010722 unicode_append_inplace(p_left, right);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010723 assert(p_left == NULL || _PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010724 return;
10725 }
10726 }
10727
10728 res = PyUnicode_Concat(left, right);
10729 if (res == NULL)
10730 goto error;
10731 Py_DECREF(left);
10732 *p_left = res;
10733 return;
10734
10735error:
10736 Py_DECREF(*p_left);
10737 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010738}
10739
10740void
10741PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10742{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010743 PyUnicode_Append(pleft, right);
10744 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010745}
10746
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010747PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010748 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010750Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010751string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010752interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010753
10754static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010755unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010756{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010757 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010758 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010759 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010761 int kind1, kind2, kind;
10762 void *buf1, *buf2;
10763 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764
Jesus Ceaac451502011-04-20 17:09:23 +020010765 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10766 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010767 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 kind1 = PyUnicode_KIND(self);
10770 kind2 = PyUnicode_KIND(substring);
10771 kind = kind1 > kind2 ? kind1 : kind2;
10772 buf1 = PyUnicode_DATA(self);
10773 buf2 = PyUnicode_DATA(substring);
10774 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010775 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 if (!buf1) {
10777 Py_DECREF(substring);
10778 return NULL;
10779 }
10780 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010781 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010782 if (!buf2) {
10783 Py_DECREF(substring);
10784 if (kind1 != kind) PyMem_Free(buf1);
10785 return NULL;
10786 }
10787 len1 = PyUnicode_GET_LENGTH(self);
10788 len2 = PyUnicode_GET_LENGTH(substring);
10789
10790 ADJUST_INDICES(start, end, len1);
10791 switch(kind) {
10792 case PyUnicode_1BYTE_KIND:
10793 iresult = ucs1lib_count(
10794 ((Py_UCS1*)buf1) + start, end - start,
10795 buf2, len2, PY_SSIZE_T_MAX
10796 );
10797 break;
10798 case PyUnicode_2BYTE_KIND:
10799 iresult = ucs2lib_count(
10800 ((Py_UCS2*)buf1) + start, end - start,
10801 buf2, len2, PY_SSIZE_T_MAX
10802 );
10803 break;
10804 case PyUnicode_4BYTE_KIND:
10805 iresult = ucs4lib_count(
10806 ((Py_UCS4*)buf1) + start, end - start,
10807 buf2, len2, PY_SSIZE_T_MAX
10808 );
10809 break;
10810 default:
10811 assert(0); iresult = 0;
10812 }
10813
10814 result = PyLong_FromSsize_t(iresult);
10815
10816 if (kind1 != kind)
10817 PyMem_Free(buf1);
10818 if (kind2 != kind)
10819 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010820
10821 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010822
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823 return result;
10824}
10825
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010826PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010827 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010829Encode S using the codec registered for encoding. Default encoding\n\
10830is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010831handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010832a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10833'xmlcharrefreplace' as well as any other name registered with\n\
10834codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835
10836static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010837unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010839 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840 char *encoding = NULL;
10841 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010842
Benjamin Peterson308d6372009-09-18 21:42:35 +000010843 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10844 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010845 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010846 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010847}
10848
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010849PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010850 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010851\n\
10852Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010853If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010854
10855static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010856unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010857{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010858 Py_ssize_t i, j, line_pos, src_len, incr;
10859 Py_UCS4 ch;
10860 PyObject *u;
10861 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010862 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010863 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010864 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010865
10866 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010867 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868
Antoine Pitrou22425222011-10-04 19:10:51 +020010869 if (PyUnicode_READY(self) == -1)
10870 return NULL;
10871
Thomas Wouters7e474022000-07-16 12:04:32 +000010872 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010873 src_len = PyUnicode_GET_LENGTH(self);
10874 i = j = line_pos = 0;
10875 kind = PyUnicode_KIND(self);
10876 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010877 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010878 for (; i < src_len; i++) {
10879 ch = PyUnicode_READ(kind, src_data, i);
10880 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010881 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010882 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010883 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010884 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010885 goto overflow;
10886 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010887 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010888 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010889 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010891 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010892 goto overflow;
10893 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010894 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010895 if (ch == '\n' || ch == '\r')
10896 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010897 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010898 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010899 if (!found && PyUnicode_CheckExact(self)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +010010900 Py_INCREF(self);
10901 return self;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010902 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010903
Guido van Rossumd57fd912000-03-10 22:53:23 +000010904 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010905 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010906 if (!u)
10907 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010908 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010909
Antoine Pitroue71d5742011-10-04 15:55:09 +020010910 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911
Antoine Pitroue71d5742011-10-04 15:55:09 +020010912 for (; i < src_len; i++) {
10913 ch = PyUnicode_READ(kind, src_data, i);
10914 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010915 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010916 incr = tabsize - (line_pos % tabsize);
10917 line_pos += incr;
10918 while (incr--) {
10919 PyUnicode_WRITE(kind, dest_data, j, ' ');
10920 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010921 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010922 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010923 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010924 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010925 line_pos++;
10926 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010927 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010928 if (ch == '\n' || ch == '\r')
10929 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010930 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010931 }
10932 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010933 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010934
Antoine Pitroue71d5742011-10-04 15:55:09 +020010935 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010936 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10937 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010938}
10939
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010940PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010941 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942\n\
10943Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010944such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945arguments start and end are interpreted as in slice notation.\n\
10946\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010947Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948
10949static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010950unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010952 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010953 Py_ssize_t start;
10954 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010955 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956
Jesus Ceaac451502011-04-20 17:09:23 +020010957 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10958 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010961 if (PyUnicode_READY(self) == -1)
10962 return NULL;
10963 if (PyUnicode_READY(substring) == -1)
10964 return NULL;
10965
Victor Stinner7931d9a2011-11-04 00:22:48 +010010966 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967
10968 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970 if (result == -2)
10971 return NULL;
10972
Christian Heimes217cfd12007-12-02 14:31:20 +000010973 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974}
10975
10976static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010977unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010979 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10980 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010981 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983}
10984
Guido van Rossumc2504932007-09-18 19:42:40 +000010985/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010986 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010987static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010988unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989{
Guido van Rossumc2504932007-09-18 19:42:40 +000010990 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010991 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010993 if (_PyUnicode_HASH(self) != -1)
10994 return _PyUnicode_HASH(self);
10995 if (PyUnicode_READY(self) == -1)
10996 return -1;
10997 len = PyUnicode_GET_LENGTH(self);
10998
10999 /* The hash function as a macro, gets expanded three times below. */
11000#define HASH(P) \
11001 x = (Py_uhash_t)*P << 7; \
11002 while (--len >= 0) \
11003 x = (1000003*x) ^ (Py_uhash_t)*P++;
11004
11005 switch (PyUnicode_KIND(self)) {
11006 case PyUnicode_1BYTE_KIND: {
11007 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11008 HASH(c);
11009 break;
11010 }
11011 case PyUnicode_2BYTE_KIND: {
11012 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11013 HASH(s);
11014 break;
11015 }
11016 default: {
11017 Py_UCS4 *l;
11018 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11019 "Impossible switch case in unicode_hash");
11020 l = PyUnicode_4BYTE_DATA(self);
11021 HASH(l);
11022 break;
11023 }
11024 }
11025 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11026
Guido van Rossumc2504932007-09-18 19:42:40 +000011027 if (x == -1)
11028 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011029 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011030 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011032#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011034PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011035 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011037Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011038
11039static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011040unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011042 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011043 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011044 Py_ssize_t start;
11045 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046
Jesus Ceaac451502011-04-20 17:09:23 +020011047 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11048 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011051 if (PyUnicode_READY(self) == -1)
11052 return NULL;
11053 if (PyUnicode_READY(substring) == -1)
11054 return NULL;
11055
Victor Stinner7931d9a2011-11-04 00:22:48 +010011056 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057
11058 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011060 if (result == -2)
11061 return NULL;
11062
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063 if (result < 0) {
11064 PyErr_SetString(PyExc_ValueError, "substring not found");
11065 return NULL;
11066 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011067
Christian Heimes217cfd12007-12-02 14:31:20 +000011068 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069}
11070
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011071PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011072 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011074Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011075at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011076
11077static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011078unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011079{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011080 Py_ssize_t i, length;
11081 int kind;
11082 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011083 int cased;
11084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011085 if (PyUnicode_READY(self) == -1)
11086 return NULL;
11087 length = PyUnicode_GET_LENGTH(self);
11088 kind = PyUnicode_KIND(self);
11089 data = PyUnicode_DATA(self);
11090
Guido van Rossumd57fd912000-03-10 22:53:23 +000011091 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 if (length == 1)
11093 return PyBool_FromLong(
11094 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011096 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011097 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011098 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011099
Guido van Rossumd57fd912000-03-10 22:53:23 +000011100 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011101 for (i = 0; i < length; i++) {
11102 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011103
Benjamin Peterson29060642009-01-31 22:14:21 +000011104 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11105 return PyBool_FromLong(0);
11106 else if (!cased && Py_UNICODE_ISLOWER(ch))
11107 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011108 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011109 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110}
11111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011112PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011113 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011114\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011115Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011116at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011117
11118static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011119unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011120{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011121 Py_ssize_t i, length;
11122 int kind;
11123 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011124 int cased;
11125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011126 if (PyUnicode_READY(self) == -1)
11127 return NULL;
11128 length = PyUnicode_GET_LENGTH(self);
11129 kind = PyUnicode_KIND(self);
11130 data = PyUnicode_DATA(self);
11131
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011133 if (length == 1)
11134 return PyBool_FromLong(
11135 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011136
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011137 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011138 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011139 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011140
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011142 for (i = 0; i < length; i++) {
11143 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011144
Benjamin Peterson29060642009-01-31 22:14:21 +000011145 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11146 return PyBool_FromLong(0);
11147 else if (!cased && Py_UNICODE_ISUPPER(ch))
11148 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011150 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011151}
11152
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011153PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011154 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011155\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011156Return True if S is a titlecased string and there is at least one\n\
11157character in S, i.e. upper- and titlecase characters may only\n\
11158follow uncased characters and lowercase characters only cased ones.\n\
11159Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011160
11161static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011162unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011163{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011164 Py_ssize_t i, length;
11165 int kind;
11166 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167 int cased, previous_is_cased;
11168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011169 if (PyUnicode_READY(self) == -1)
11170 return NULL;
11171 length = PyUnicode_GET_LENGTH(self);
11172 kind = PyUnicode_KIND(self);
11173 data = PyUnicode_DATA(self);
11174
Guido van Rossumd57fd912000-03-10 22:53:23 +000011175 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011176 if (length == 1) {
11177 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11178 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11179 (Py_UNICODE_ISUPPER(ch) != 0));
11180 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011181
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011182 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011183 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011184 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011185
Guido van Rossumd57fd912000-03-10 22:53:23 +000011186 cased = 0;
11187 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011188 for (i = 0; i < length; i++) {
11189 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011190
Benjamin Peterson29060642009-01-31 22:14:21 +000011191 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11192 if (previous_is_cased)
11193 return PyBool_FromLong(0);
11194 previous_is_cased = 1;
11195 cased = 1;
11196 }
11197 else if (Py_UNICODE_ISLOWER(ch)) {
11198 if (!previous_is_cased)
11199 return PyBool_FromLong(0);
11200 previous_is_cased = 1;
11201 cased = 1;
11202 }
11203 else
11204 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011206 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207}
11208
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011209PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011210 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011212Return True if all characters in S are whitespace\n\
11213and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011214
11215static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011216unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011218 Py_ssize_t i, length;
11219 int kind;
11220 void *data;
11221
11222 if (PyUnicode_READY(self) == -1)
11223 return NULL;
11224 length = PyUnicode_GET_LENGTH(self);
11225 kind = PyUnicode_KIND(self);
11226 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011229 if (length == 1)
11230 return PyBool_FromLong(
11231 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011233 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011234 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011235 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237 for (i = 0; i < length; i++) {
11238 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011239 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011240 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011242 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243}
11244
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011245PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011246 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011247\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011248Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011249and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011250
11251static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011252unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011253{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011254 Py_ssize_t i, length;
11255 int kind;
11256 void *data;
11257
11258 if (PyUnicode_READY(self) == -1)
11259 return NULL;
11260 length = PyUnicode_GET_LENGTH(self);
11261 kind = PyUnicode_KIND(self);
11262 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011263
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011264 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265 if (length == 1)
11266 return PyBool_FromLong(
11267 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011268
11269 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011270 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011271 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011273 for (i = 0; i < length; i++) {
11274 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011275 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011276 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011277 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011278}
11279
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011280PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011281 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011282\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011283Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011284and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011285
11286static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011287unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011288{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011289 int kind;
11290 void *data;
11291 Py_ssize_t len, i;
11292
11293 if (PyUnicode_READY(self) == -1)
11294 return NULL;
11295
11296 kind = PyUnicode_KIND(self);
11297 data = PyUnicode_DATA(self);
11298 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011299
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011300 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011301 if (len == 1) {
11302 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11303 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11304 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011305
11306 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011307 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011308 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011310 for (i = 0; i < len; i++) {
11311 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011312 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011313 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011314 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011315 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011316}
11317
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011318PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011319 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011321Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011322False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323
11324static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011325unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011327 Py_ssize_t i, length;
11328 int kind;
11329 void *data;
11330
11331 if (PyUnicode_READY(self) == -1)
11332 return NULL;
11333 length = PyUnicode_GET_LENGTH(self);
11334 kind = PyUnicode_KIND(self);
11335 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336
Guido van Rossumd57fd912000-03-10 22:53:23 +000011337 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 if (length == 1)
11339 return PyBool_FromLong(
11340 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011342 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011343 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011344 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011346 for (i = 0; i < length; i++) {
11347 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011348 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011350 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351}
11352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011353PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011354 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011355\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011356Return True if all characters in S are digits\n\
11357and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358
11359static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011360unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362 Py_ssize_t i, length;
11363 int kind;
11364 void *data;
11365
11366 if (PyUnicode_READY(self) == -1)
11367 return NULL;
11368 length = PyUnicode_GET_LENGTH(self);
11369 kind = PyUnicode_KIND(self);
11370 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 if (length == 1) {
11374 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11375 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11376 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011377
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011378 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011379 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011380 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382 for (i = 0; i < length; i++) {
11383 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011384 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011386 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011387}
11388
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011389PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011390 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011391\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011392Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011393False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011394
11395static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011396unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398 Py_ssize_t i, length;
11399 int kind;
11400 void *data;
11401
11402 if (PyUnicode_READY(self) == -1)
11403 return NULL;
11404 length = PyUnicode_GET_LENGTH(self);
11405 kind = PyUnicode_KIND(self);
11406 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409 if (length == 1)
11410 return PyBool_FromLong(
11411 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011413 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011414 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011415 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011417 for (i = 0; i < length; i++) {
11418 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011419 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011421 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422}
11423
Martin v. Löwis47383402007-08-15 07:32:56 +000011424int
11425PyUnicode_IsIdentifier(PyObject *self)
11426{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011427 int kind;
11428 void *data;
11429 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011430 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011432 if (PyUnicode_READY(self) == -1) {
11433 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011434 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011435 }
11436
11437 /* Special case for empty strings */
11438 if (PyUnicode_GET_LENGTH(self) == 0)
11439 return 0;
11440 kind = PyUnicode_KIND(self);
11441 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011442
11443 /* PEP 3131 says that the first character must be in
11444 XID_Start and subsequent characters in XID_Continue,
11445 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011446 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011447 letters, digits, underscore). However, given the current
11448 definition of XID_Start and XID_Continue, it is sufficient
11449 to check just for these, except that _ must be allowed
11450 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011452 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011453 return 0;
11454
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011455 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011456 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011457 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011458 return 1;
11459}
11460
11461PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011462 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011463\n\
11464Return True if S is a valid identifier according\n\
11465to the language definition.");
11466
11467static PyObject*
11468unicode_isidentifier(PyObject *self)
11469{
11470 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11471}
11472
Georg Brandl559e5d72008-06-11 18:37:52 +000011473PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011474 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011475\n\
11476Return True if all characters in S are considered\n\
11477printable in repr() or S is empty, False otherwise.");
11478
11479static PyObject*
11480unicode_isprintable(PyObject *self)
11481{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011482 Py_ssize_t i, length;
11483 int kind;
11484 void *data;
11485
11486 if (PyUnicode_READY(self) == -1)
11487 return NULL;
11488 length = PyUnicode_GET_LENGTH(self);
11489 kind = PyUnicode_KIND(self);
11490 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011491
11492 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011493 if (length == 1)
11494 return PyBool_FromLong(
11495 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011496
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011497 for (i = 0; i < length; i++) {
11498 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011499 Py_RETURN_FALSE;
11500 }
11501 }
11502 Py_RETURN_TRUE;
11503}
11504
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011505PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011506 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507\n\
11508Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011509iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510
11511static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011512unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011514 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515}
11516
Martin v. Löwis18e16552006-02-15 17:27:45 +000011517static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011518unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011520 if (PyUnicode_READY(self) == -1)
11521 return -1;
11522 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523}
11524
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011525PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011526 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011528Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011529done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530
11531static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011532unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011534 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 Py_UCS4 fillchar = ' ';
11536
11537 if (PyUnicode_READY(self) == -1)
11538 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011539
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011540 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541 return NULL;
11542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011543 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011545 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546 }
11547
Victor Stinner7931d9a2011-11-04 00:22:48 +010011548 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549}
11550
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011551PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011552 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011554Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555
11556static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011557unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559 return fixup(self, fixlower);
11560}
11561
/* Strip modes; the values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string without its
   leading "|O:" (three characters). */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11570
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011571/* externally visible for str.strip(unicode) */
11572PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011573_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011574{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011575 void *data;
11576 int kind;
11577 Py_ssize_t i, j, len;
11578 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011580 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11581 return NULL;
11582
11583 kind = PyUnicode_KIND(self);
11584 data = PyUnicode_DATA(self);
11585 len = PyUnicode_GET_LENGTH(self);
11586 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11587 PyUnicode_DATA(sepobj),
11588 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011589
Benjamin Peterson14339b62009-01-31 16:36:08 +000011590 i = 0;
11591 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011592 while (i < len &&
11593 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011594 i++;
11595 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011596 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011597
Benjamin Peterson14339b62009-01-31 16:36:08 +000011598 j = len;
11599 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011600 do {
11601 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011602 } while (j >= i &&
11603 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011604 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011605 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011606
Victor Stinner7931d9a2011-11-04 00:22:48 +010011607 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011608}
11609
11610PyObject*
11611PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11612{
11613 unsigned char *data;
11614 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011615 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616
Victor Stinnerde636f32011-10-01 03:55:54 +020011617 if (PyUnicode_READY(self) == -1)
11618 return NULL;
11619
11620 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11621
Victor Stinner12bab6d2011-10-01 01:53:49 +020011622 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011623 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011624 if (PyUnicode_CheckExact(self)) {
11625 Py_INCREF(self);
11626 return self;
11627 }
11628 else
11629 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011630 }
11631
Victor Stinner12bab6d2011-10-01 01:53:49 +020011632 length = end - start;
11633 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011634 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635
Victor Stinnerde636f32011-10-01 03:55:54 +020011636 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011637 PyErr_SetString(PyExc_IndexError, "string index out of range");
11638 return NULL;
11639 }
11640
Victor Stinnerb9275c12011-10-05 14:01:42 +020011641 if (PyUnicode_IS_ASCII(self)) {
11642 kind = PyUnicode_KIND(self);
11643 data = PyUnicode_1BYTE_DATA(self);
11644 return unicode_fromascii(data + start, length);
11645 }
11646 else {
11647 kind = PyUnicode_KIND(self);
11648 data = PyUnicode_1BYTE_DATA(self);
11649 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011650 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011651 length);
11652 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011653}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011654
11655static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011656do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011658 int kind;
11659 void *data;
11660 Py_ssize_t len, i, j;
11661
11662 if (PyUnicode_READY(self) == -1)
11663 return NULL;
11664
11665 kind = PyUnicode_KIND(self);
11666 data = PyUnicode_DATA(self);
11667 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011668
Benjamin Peterson14339b62009-01-31 16:36:08 +000011669 i = 0;
11670 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011671 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011672 i++;
11673 }
11674 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011675
Benjamin Peterson14339b62009-01-31 16:36:08 +000011676 j = len;
11677 if (striptype != LEFTSTRIP) {
11678 do {
11679 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011681 j++;
11682 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011683
Victor Stinner7931d9a2011-11-04 00:22:48 +010011684 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685}
11686
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011687
11688static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011689do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011690{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011691 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011692
Benjamin Peterson14339b62009-01-31 16:36:08 +000011693 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11694 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011695
Benjamin Peterson14339b62009-01-31 16:36:08 +000011696 if (sep != NULL && sep != Py_None) {
11697 if (PyUnicode_Check(sep))
11698 return _PyUnicode_XStrip(self, striptype, sep);
11699 else {
11700 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011701 "%s arg must be None or str",
11702 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011703 return NULL;
11704 }
11705 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011706
Benjamin Peterson14339b62009-01-31 16:36:08 +000011707 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011708}
11709
11710
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011711PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011712 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011713\n\
11714Return a copy of the string S with leading and trailing\n\
11715whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011716If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011717
11718static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011719unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011720{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011721 if (PyTuple_GET_SIZE(args) == 0)
11722 return do_strip(self, BOTHSTRIP); /* Common case */
11723 else
11724 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011725}
11726
11727
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011728PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011729 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011730\n\
11731Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011732If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011733
11734static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011735unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011736{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011737 if (PyTuple_GET_SIZE(args) == 0)
11738 return do_strip(self, LEFTSTRIP); /* Common case */
11739 else
11740 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011741}
11742
11743
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011744PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011745 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011746\n\
11747Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011748If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011749
11750static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011751unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011752{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011753 if (PyTuple_GET_SIZE(args) == 0)
11754 return do_strip(self, RIGHTSTRIP); /* Common case */
11755 else
11756 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011757}
11758
11759
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011761unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011763 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765
Georg Brandl222de0f2009-04-12 12:01:50 +000011766 if (len < 1) {
11767 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011768 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011769 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770
Tim Peters7a29bd52001-09-12 03:03:31 +000011771 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772 /* no repeat, return original string */
11773 Py_INCREF(str);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011774 return str;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775 }
Tim Peters8f422462000-09-09 06:13:41 +000011776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 if (PyUnicode_READY(str) == -1)
11778 return NULL;
11779
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011780 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011781 PyErr_SetString(PyExc_OverflowError,
11782 "repeated string is too long");
11783 return NULL;
11784 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011786
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011787 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788 if (!u)
11789 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011790 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 if (PyUnicode_GET_LENGTH(str) == 1) {
11793 const int kind = PyUnicode_KIND(str);
11794 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11795 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011796 if (kind == PyUnicode_1BYTE_KIND)
11797 memset(to, (unsigned char)fill_char, len);
11798 else {
11799 for (n = 0; n < len; ++n)
11800 PyUnicode_WRITE(kind, to, n, fill_char);
11801 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 }
11803 else {
11804 /* number of characters copied this far */
11805 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011806 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011807 char *to = (char *) PyUnicode_DATA(u);
11808 Py_MEMCPY(to, PyUnicode_DATA(str),
11809 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011810 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011811 n = (done <= nchars-done) ? done : nchars-done;
11812 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011813 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011814 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815 }
11816
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011817 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011818 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819}
11820
Alexander Belopolsky40018472011-02-26 01:02:56 +000011821PyObject *
11822PyUnicode_Replace(PyObject *obj,
11823 PyObject *subobj,
11824 PyObject *replobj,
11825 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826{
11827 PyObject *self;
11828 PyObject *str1;
11829 PyObject *str2;
11830 PyObject *result;
11831
11832 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011833 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011834 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011836 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011837 Py_DECREF(self);
11838 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839 }
11840 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011841 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011842 Py_DECREF(self);
11843 Py_DECREF(str1);
11844 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011846 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847 Py_DECREF(self);
11848 Py_DECREF(str1);
11849 Py_DECREF(str2);
11850 return result;
11851}
11852
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011853PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011854 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855\n\
11856Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011857old replaced by new. If the optional argument count is\n\
11858given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859
11860static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011861unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011862{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011863 PyObject *str1;
11864 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011865 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866 PyObject *result;
11867
Martin v. Löwis18e16552006-02-15 17:27:45 +000011868 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011870 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011871 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 str1 = PyUnicode_FromObject(str1);
11873 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11874 return NULL;
11875 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011876 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011877 Py_DECREF(str1);
11878 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011879 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880
11881 result = replace(self, str1, str2, maxcount);
11882
11883 Py_DECREF(str1);
11884 Py_DECREF(str2);
11885 return result;
11886}
11887
Alexander Belopolsky40018472011-02-26 01:02:56 +000011888static PyObject *
11889unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011891 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892 Py_ssize_t isize;
11893 Py_ssize_t osize, squote, dquote, i, o;
11894 Py_UCS4 max, quote;
11895 int ikind, okind;
11896 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011898 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011899 return NULL;
11900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 isize = PyUnicode_GET_LENGTH(unicode);
11902 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011904 /* Compute length of output, quote characters, and
11905 maximum character */
11906 osize = 2; /* quotes */
11907 max = 127;
11908 squote = dquote = 0;
11909 ikind = PyUnicode_KIND(unicode);
11910 for (i = 0; i < isize; i++) {
11911 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11912 switch (ch) {
11913 case '\'': squote++; osize++; break;
11914 case '"': dquote++; osize++; break;
11915 case '\\': case '\t': case '\r': case '\n':
11916 osize += 2; break;
11917 default:
11918 /* Fast-path ASCII */
11919 if (ch < ' ' || ch == 0x7f)
11920 osize += 4; /* \xHH */
11921 else if (ch < 0x7f)
11922 osize++;
11923 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11924 osize++;
11925 max = ch > max ? ch : max;
11926 }
11927 else if (ch < 0x100)
11928 osize += 4; /* \xHH */
11929 else if (ch < 0x10000)
11930 osize += 6; /* \uHHHH */
11931 else
11932 osize += 10; /* \uHHHHHHHH */
11933 }
11934 }
11935
11936 quote = '\'';
11937 if (squote) {
11938 if (dquote)
11939 /* Both squote and dquote present. Use squote,
11940 and escape them */
11941 osize += squote;
11942 else
11943 quote = '"';
11944 }
11945
11946 repr = PyUnicode_New(osize, max);
11947 if (repr == NULL)
11948 return NULL;
11949 okind = PyUnicode_KIND(repr);
11950 odata = PyUnicode_DATA(repr);
11951
11952 PyUnicode_WRITE(okind, odata, 0, quote);
11953 PyUnicode_WRITE(okind, odata, osize-1, quote);
11954
11955 for (i = 0, o = 1; i < isize; i++) {
11956 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011957
11958 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959 if ((ch == quote) || (ch == '\\')) {
11960 PyUnicode_WRITE(okind, odata, o++, '\\');
11961 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011962 continue;
11963 }
11964
Benjamin Peterson29060642009-01-31 22:14:21 +000011965 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011966 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 PyUnicode_WRITE(okind, odata, o++, '\\');
11968 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011969 }
11970 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 PyUnicode_WRITE(okind, odata, o++, '\\');
11972 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011973 }
11974 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 PyUnicode_WRITE(okind, odata, o++, '\\');
11976 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011977 }
11978
11979 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011980 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 PyUnicode_WRITE(okind, odata, o++, '\\');
11982 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011983 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11984 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011985 }
11986
Georg Brandl559e5d72008-06-11 18:37:52 +000011987 /* Copy ASCII characters as-is */
11988 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011989 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011990 }
11991
Benjamin Peterson29060642009-01-31 22:14:21 +000011992 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011993 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011994 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011995 (categories Z* and C* except ASCII space)
11996 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011998 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 if (ch <= 0xff) {
12000 PyUnicode_WRITE(okind, odata, o++, '\\');
12001 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012002 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12003 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012004 }
12005 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006 else if (ch >= 0x10000) {
12007 PyUnicode_WRITE(okind, odata, o++, '\\');
12008 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012009 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12010 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12011 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12012 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12013 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12014 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12015 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12016 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012017 }
12018 /* Map 16-bit characters to '\uxxxx' */
12019 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 PyUnicode_WRITE(okind, odata, o++, '\\');
12021 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012022 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12023 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12024 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12025 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012026 }
12027 }
12028 /* Copy characters as-is */
12029 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012031 }
12032 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012033 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012035 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012036 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012037}
12038
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012039PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012040 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041\n\
12042Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012043such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012044arguments start and end are interpreted as in slice notation.\n\
12045\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012046Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047
12048static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012049unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012051 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012052 Py_ssize_t start;
12053 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012054 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055
Jesus Ceaac451502011-04-20 17:09:23 +020012056 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12057 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012058 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 if (PyUnicode_READY(self) == -1)
12061 return NULL;
12062 if (PyUnicode_READY(substring) == -1)
12063 return NULL;
12064
Victor Stinner7931d9a2011-11-04 00:22:48 +010012065 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066
12067 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069 if (result == -2)
12070 return NULL;
12071
Christian Heimes217cfd12007-12-02 14:31:20 +000012072 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073}
12074
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012075PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012076 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012078Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079
12080static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012081unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012083 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012084 Py_ssize_t start;
12085 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012086 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087
Jesus Ceaac451502011-04-20 17:09:23 +020012088 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12089 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012090 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 if (PyUnicode_READY(self) == -1)
12093 return NULL;
12094 if (PyUnicode_READY(substring) == -1)
12095 return NULL;
12096
Victor Stinner7931d9a2011-11-04 00:22:48 +010012097 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098
12099 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012100
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012101 if (result == -2)
12102 return NULL;
12103
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104 if (result < 0) {
12105 PyErr_SetString(PyExc_ValueError, "substring not found");
12106 return NULL;
12107 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108
Christian Heimes217cfd12007-12-02 14:31:20 +000012109 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110}
12111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012112PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012113 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012114\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012115Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012116done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117
12118static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012119unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012121 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012122 Py_UCS4 fillchar = ' ';
12123
Victor Stinnere9a29352011-10-01 02:14:59 +020012124 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012126
Victor Stinnere9a29352011-10-01 02:14:59 +020012127 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012128 return NULL;
12129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012130 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012131 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012132 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012133 }
12134
Victor Stinner7931d9a2011-11-04 00:22:48 +010012135 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012136}
12137
Alexander Belopolsky40018472011-02-26 01:02:56 +000012138PyObject *
12139PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012140{
12141 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012142
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143 s = PyUnicode_FromObject(s);
12144 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012145 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012146 if (sep != NULL) {
12147 sep = PyUnicode_FromObject(sep);
12148 if (sep == NULL) {
12149 Py_DECREF(s);
12150 return NULL;
12151 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012152 }
12153
Victor Stinner9310abb2011-10-05 00:59:23 +020012154 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155
12156 Py_DECREF(s);
12157 Py_XDECREF(sep);
12158 return result;
12159}
12160
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012161PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012162 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163\n\
12164Return a list of the words in S, using sep as the\n\
12165delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012166splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012167whitespace string is a separator and empty strings are\n\
12168removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012169
12170static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012171unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172{
12173 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012174 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175
Martin v. Löwis18e16552006-02-15 17:27:45 +000012176 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177 return NULL;
12178
12179 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012180 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012182 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012184 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012185}
12186
Thomas Wouters477c8d52006-05-27 19:21:47 +000012187PyObject *
12188PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12189{
12190 PyObject* str_obj;
12191 PyObject* sep_obj;
12192 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193 int kind1, kind2, kind;
12194 void *buf1 = NULL, *buf2 = NULL;
12195 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012196
12197 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012198 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012199 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012200 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012201 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012202 Py_DECREF(str_obj);
12203 return NULL;
12204 }
12205
Victor Stinner14f8f022011-10-05 20:58:25 +020012206 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012208 kind = Py_MAX(kind1, kind2);
12209 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012211 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212 if (!buf1)
12213 goto onError;
12214 buf2 = PyUnicode_DATA(sep_obj);
12215 if (kind2 != kind)
12216 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12217 if (!buf2)
12218 goto onError;
12219 len1 = PyUnicode_GET_LENGTH(str_obj);
12220 len2 = PyUnicode_GET_LENGTH(sep_obj);
12221
Victor Stinner14f8f022011-10-05 20:58:25 +020012222 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012224 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12225 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12226 else
12227 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012228 break;
12229 case PyUnicode_2BYTE_KIND:
12230 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12231 break;
12232 case PyUnicode_4BYTE_KIND:
12233 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12234 break;
12235 default:
12236 assert(0);
12237 out = 0;
12238 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012239
12240 Py_DECREF(sep_obj);
12241 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012242 if (kind1 != kind)
12243 PyMem_Free(buf1);
12244 if (kind2 != kind)
12245 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012246
12247 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012248 onError:
12249 Py_DECREF(sep_obj);
12250 Py_DECREF(str_obj);
12251 if (kind1 != kind && buf1)
12252 PyMem_Free(buf1);
12253 if (kind2 != kind && buf2)
12254 PyMem_Free(buf2);
12255 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012256}
12257
12258
12259PyObject *
12260PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12261{
12262 PyObject* str_obj;
12263 PyObject* sep_obj;
12264 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012265 int kind1, kind2, kind;
12266 void *buf1 = NULL, *buf2 = NULL;
12267 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012268
12269 str_obj = PyUnicode_FromObject(str_in);
12270 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012271 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012272 sep_obj = PyUnicode_FromObject(sep_in);
12273 if (!sep_obj) {
12274 Py_DECREF(str_obj);
12275 return NULL;
12276 }
12277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012278 kind1 = PyUnicode_KIND(str_in);
12279 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012280 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012281 buf1 = PyUnicode_DATA(str_in);
12282 if (kind1 != kind)
12283 buf1 = _PyUnicode_AsKind(str_in, kind);
12284 if (!buf1)
12285 goto onError;
12286 buf2 = PyUnicode_DATA(sep_obj);
12287 if (kind2 != kind)
12288 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12289 if (!buf2)
12290 goto onError;
12291 len1 = PyUnicode_GET_LENGTH(str_obj);
12292 len2 = PyUnicode_GET_LENGTH(sep_obj);
12293
12294 switch(PyUnicode_KIND(str_in)) {
12295 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012296 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12297 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12298 else
12299 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300 break;
12301 case PyUnicode_2BYTE_KIND:
12302 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12303 break;
12304 case PyUnicode_4BYTE_KIND:
12305 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12306 break;
12307 default:
12308 assert(0);
12309 out = 0;
12310 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012311
12312 Py_DECREF(sep_obj);
12313 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012314 if (kind1 != kind)
12315 PyMem_Free(buf1);
12316 if (kind2 != kind)
12317 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012318
12319 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320 onError:
12321 Py_DECREF(sep_obj);
12322 Py_DECREF(str_obj);
12323 if (kind1 != kind && buf1)
12324 PyMem_Free(buf1);
12325 if (kind2 != kind && buf2)
12326 PyMem_Free(buf2);
12327 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012328}
12329
12330PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012331 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012332\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012333Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012334the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012335found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012336
12337static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012338unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012339{
Victor Stinner9310abb2011-10-05 00:59:23 +020012340 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012341}
12342
12343PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012344 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012345\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012346Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012347the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012348separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012349
12350static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012351unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012352{
Victor Stinner9310abb2011-10-05 00:59:23 +020012353 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012354}
12355
Alexander Belopolsky40018472011-02-26 01:02:56 +000012356PyObject *
12357PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012358{
12359 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012360
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012361 s = PyUnicode_FromObject(s);
12362 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012363 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012364 if (sep != NULL) {
12365 sep = PyUnicode_FromObject(sep);
12366 if (sep == NULL) {
12367 Py_DECREF(s);
12368 return NULL;
12369 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012370 }
12371
Victor Stinner9310abb2011-10-05 00:59:23 +020012372 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012373
12374 Py_DECREF(s);
12375 Py_XDECREF(sep);
12376 return result;
12377}
12378
12379PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012380 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012381\n\
12382Return a list of the words in S, using sep as the\n\
12383delimiter string, starting at the end of the string and\n\
12384working to the front. If maxsplit is given, at most maxsplit\n\
12385splits are done. If sep is not specified, any whitespace string\n\
12386is a separator.");
12387
12388static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012389unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012390{
12391 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012392 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012393
Martin v. Löwis18e16552006-02-15 17:27:45 +000012394 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012395 return NULL;
12396
12397 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012398 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012399 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012400 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012401 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012402 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012403}
12404
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012405PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012406 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012407\n\
12408Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012409Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012410is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012411
12412static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012413unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012414{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012415 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012416 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012417
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012418 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12419 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012420 return NULL;
12421
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012422 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012423}
12424
12425static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012426PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012427{
Walter Dörwald346737f2007-05-31 10:44:43 +000012428 if (PyUnicode_CheckExact(self)) {
12429 Py_INCREF(self);
12430 return self;
12431 } else
12432 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012433 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012434}
12435
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012436PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012437 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012438\n\
12439Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012440and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012441
12442static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012443unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012444{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012445 return fixup(self, fixswapcase);
12446}
12447
Georg Brandlceee0772007-11-27 23:48:05 +000012448PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012449 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012450\n\
12451Return a translation table usable for str.translate().\n\
12452If there is only one argument, it must be a dictionary mapping Unicode\n\
12453ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012454Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012455If there are two arguments, they must be strings of equal length, and\n\
12456in the resulting dictionary, each character in x will be mapped to the\n\
12457character at the same position in y. If there is a third argument, it\n\
12458must be a string, whose characters will be mapped to None in the result.");
12459
12460static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012461unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012462{
12463 PyObject *x, *y = NULL, *z = NULL;
12464 PyObject *new = NULL, *key, *value;
12465 Py_ssize_t i = 0;
12466 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012467
Georg Brandlceee0772007-11-27 23:48:05 +000012468 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12469 return NULL;
12470 new = PyDict_New();
12471 if (!new)
12472 return NULL;
12473 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012474 int x_kind, y_kind, z_kind;
12475 void *x_data, *y_data, *z_data;
12476
Georg Brandlceee0772007-11-27 23:48:05 +000012477 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012478 if (!PyUnicode_Check(x)) {
12479 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12480 "be a string if there is a second argument");
12481 goto err;
12482 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012483 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012484 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12485 "arguments must have equal length");
12486 goto err;
12487 }
12488 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012489 x_kind = PyUnicode_KIND(x);
12490 y_kind = PyUnicode_KIND(y);
12491 x_data = PyUnicode_DATA(x);
12492 y_data = PyUnicode_DATA(y);
12493 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12494 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12495 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012496 if (!key || !value)
12497 goto err;
12498 res = PyDict_SetItem(new, key, value);
12499 Py_DECREF(key);
12500 Py_DECREF(value);
12501 if (res < 0)
12502 goto err;
12503 }
12504 /* create entries for deleting chars in z */
12505 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012506 z_kind = PyUnicode_KIND(z);
12507 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012508 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012510 if (!key)
12511 goto err;
12512 res = PyDict_SetItem(new, key, Py_None);
12513 Py_DECREF(key);
12514 if (res < 0)
12515 goto err;
12516 }
12517 }
12518 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012519 int kind;
12520 void *data;
12521
Georg Brandlceee0772007-11-27 23:48:05 +000012522 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012523 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012524 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12525 "to maketrans it must be a dict");
12526 goto err;
12527 }
12528 /* copy entries into the new dict, converting string keys to int keys */
12529 while (PyDict_Next(x, &i, &key, &value)) {
12530 if (PyUnicode_Check(key)) {
12531 /* convert string keys to integer keys */
12532 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012533 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012534 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12535 "table must be of length 1");
12536 goto err;
12537 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012538 kind = PyUnicode_KIND(key);
12539 data = PyUnicode_DATA(key);
12540 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012541 if (!newkey)
12542 goto err;
12543 res = PyDict_SetItem(new, newkey, value);
12544 Py_DECREF(newkey);
12545 if (res < 0)
12546 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012547 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012548 /* just keep integer keys */
12549 if (PyDict_SetItem(new, key, value) < 0)
12550 goto err;
12551 } else {
12552 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12553 "be strings or integers");
12554 goto err;
12555 }
12556 }
12557 }
12558 return new;
12559 err:
12560 Py_DECREF(new);
12561 return NULL;
12562}
12563
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012564PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012565 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566\n\
12567Return a copy of the string S, where all characters have been mapped\n\
12568through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012569Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012570Unmapped characters are left untouched. Characters mapped to None\n\
12571are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572
12573static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012576 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577}
12578
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012579PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012580 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012582Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583
12584static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012585unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012586{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012587 return fixup(self, fixupper);
12588}
12589
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012590PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012591 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012592\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012593Pad a numeric string S with zeros on the left, to fill a field\n\
12594of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012595
12596static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012597unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012598{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012599 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012600 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012601 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012602 int kind;
12603 void *data;
12604 Py_UCS4 chr;
12605
12606 if (PyUnicode_READY(self) == -1)
12607 return NULL;
12608
Martin v. Löwis18e16552006-02-15 17:27:45 +000012609 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610 return NULL;
12611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012613 if (PyUnicode_CheckExact(self)) {
12614 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012615 return self;
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012616 }
12617 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012618 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012619 }
12620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012621 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012622
12623 u = pad(self, fill, 0, '0');
12624
Walter Dörwald068325e2002-04-15 13:36:47 +000012625 if (u == NULL)
12626 return NULL;
12627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012628 kind = PyUnicode_KIND(u);
12629 data = PyUnicode_DATA(u);
12630 chr = PyUnicode_READ(kind, data, fill);
12631
12632 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634 PyUnicode_WRITE(kind, data, 0, chr);
12635 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636 }
12637
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012638 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012639 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012640}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641
#if 0
/* Compiled out by the enclosing #if 0.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12649
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012650PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012651 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012652\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012653Return True if S starts with the specified prefix, False otherwise.\n\
12654With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012655With optional end, stop comparing S at that position.\n\
12656prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012657
12658static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012659unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012660 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012662 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012663 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012664 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012665 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012666 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667
Jesus Ceaac451502011-04-20 17:09:23 +020012668 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012669 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012670 if (PyTuple_Check(subobj)) {
12671 Py_ssize_t i;
12672 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012673 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012674 if (substring == NULL)
12675 return NULL;
12676 result = tailmatch(self, substring, start, end, -1);
12677 Py_DECREF(substring);
12678 if (result) {
12679 Py_RETURN_TRUE;
12680 }
12681 }
12682 /* nothing matched */
12683 Py_RETURN_FALSE;
12684 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012685 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012686 if (substring == NULL) {
12687 if (PyErr_ExceptionMatches(PyExc_TypeError))
12688 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12689 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012690 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012691 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012692 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012693 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012694 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012695}
12696
12697
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012698PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012699 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012700\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012701Return True if S ends with the specified suffix, False otherwise.\n\
12702With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012703With optional end, stop comparing S at that position.\n\
12704suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012705
12706static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012707unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012708 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012709{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012710 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012711 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012712 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012713 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012714 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012715
Jesus Ceaac451502011-04-20 17:09:23 +020012716 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012717 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012718 if (PyTuple_Check(subobj)) {
12719 Py_ssize_t i;
12720 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012721 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012722 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012723 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012724 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012725 result = tailmatch(self, substring, start, end, +1);
12726 Py_DECREF(substring);
12727 if (result) {
12728 Py_RETURN_TRUE;
12729 }
12730 }
12731 Py_RETURN_FALSE;
12732 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012733 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012734 if (substring == NULL) {
12735 if (PyErr_ExceptionMatches(PyExc_TypeError))
12736 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12737 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012738 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012739 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012740 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012741 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012742 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012743}
12744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012745#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012746
12747PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012748 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012749\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012750Return a formatted version of S, using substitutions from args and kwargs.\n\
12751The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012752
Eric Smith27bbca62010-11-04 17:06:58 +000012753PyDoc_STRVAR(format_map__doc__,
12754 "S.format_map(mapping) -> str\n\
12755\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012756Return a formatted version of S, using substitutions from mapping.\n\
12757The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012758
Eric Smith4a7d76d2008-05-30 18:10:19 +000012759static PyObject *
12760unicode__format__(PyObject* self, PyObject* args)
12761{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012762 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012763
12764 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12765 return NULL;
12766
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012767 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012768 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012769 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012770}
12771
Eric Smith8c663262007-08-25 02:26:07 +000012772PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012773 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012774\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012775Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012776
12777static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012778unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012779{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012780 Py_ssize_t size;
12781
12782 /* If it's a compact object, account for base structure +
12783 character data. */
12784 if (PyUnicode_IS_COMPACT_ASCII(v))
12785 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12786 else if (PyUnicode_IS_COMPACT(v))
12787 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012788 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012789 else {
12790 /* If it is a two-block object, account for base object, and
12791 for character block if present. */
12792 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012793 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012794 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012795 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796 }
12797 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012798 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012799 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012800 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012801 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012802 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012803
12804 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012805}
12806
12807PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012808 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012809
12810static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012811unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012812{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012813 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012814 if (!copy)
12815 return NULL;
12816 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012817}
12818
Guido van Rossumd57fd912000-03-10 22:53:23 +000012819static PyMethodDef unicode_methods[] = {
12820
12821 /* Order is according to common usage: often used methods should
12822 appear first, since lookup is done sequentially. */
12823
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012824 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012825 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12826 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012827 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012828 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12829 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12830 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12831 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12832 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12833 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12834 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012835 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012836 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12837 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12838 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012839 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012840 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12841 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12842 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012843 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012844 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012845 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012846 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012847 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12848 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12849 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12850 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12851 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12852 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12853 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12854 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12855 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12856 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12857 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12858 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12859 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12860 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012861 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012862 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012863 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012864 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012865 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012866 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012867 {"maketrans", (PyCFunction) unicode_maketrans,
12868 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012869 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012870#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012871 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872#endif
12873
12874#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012875 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012876 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012877#endif
12878
Benjamin Peterson14339b62009-01-31 16:36:08 +000012879 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880 {NULL, NULL}
12881};
12882
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012883static PyObject *
12884unicode_mod(PyObject *v, PyObject *w)
12885{
Brian Curtindfc80e32011-08-10 20:28:54 -050012886 if (!PyUnicode_Check(v))
12887 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012888 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012889}
12890
12891static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012892 0, /*nb_add*/
12893 0, /*nb_subtract*/
12894 0, /*nb_multiply*/
12895 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012896};
12897
Guido van Rossumd57fd912000-03-10 22:53:23 +000012898static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012899 (lenfunc) unicode_length, /* sq_length */
12900 PyUnicode_Concat, /* sq_concat */
12901 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12902 (ssizeargfunc) unicode_getitem, /* sq_item */
12903 0, /* sq_slice */
12904 0, /* sq_ass_item */
12905 0, /* sq_ass_slice */
12906 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012907};
12908
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012909static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012910unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012911{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012912 if (PyUnicode_READY(self) == -1)
12913 return NULL;
12914
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012915 if (PyIndex_Check(item)) {
12916 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012917 if (i == -1 && PyErr_Occurred())
12918 return NULL;
12919 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012920 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012921 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012922 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012923 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012924 PyObject *result;
12925 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012926 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012927 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012929 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012930 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012931 return NULL;
12932 }
12933
12934 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012935 return PyUnicode_New(0, 0);
12936 } else if (start == 0 && step == 1 &&
12937 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012938 PyUnicode_CheckExact(self)) {
12939 Py_INCREF(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012940 return self;
Thomas Woutersed03b412007-08-28 21:37:11 +000012941 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012942 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020012943 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012944 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012945 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012946 src_kind = PyUnicode_KIND(self);
12947 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020012948 if (!PyUnicode_IS_ASCII(self)) {
12949 kind_limit = kind_maxchar_limit(src_kind);
12950 max_char = 0;
12951 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12952 ch = PyUnicode_READ(src_kind, src_data, cur);
12953 if (ch > max_char) {
12954 max_char = ch;
12955 if (max_char >= kind_limit)
12956 break;
12957 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012958 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012959 }
Victor Stinner55c99112011-10-13 01:17:06 +020012960 else
12961 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012962 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012963 if (result == NULL)
12964 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012965 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012966 dest_data = PyUnicode_DATA(result);
12967
12968 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012969 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12970 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012971 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012972 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012973 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012974 } else {
12975 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12976 return NULL;
12977 }
12978}
12979
12980static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012981 (lenfunc)unicode_length, /* mp_length */
12982 (binaryfunc)unicode_subscript, /* mp_subscript */
12983 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012984};
12985
Guido van Rossumd57fd912000-03-10 22:53:23 +000012986
Guido van Rossumd57fd912000-03-10 22:53:23 +000012987/* Helpers for PyUnicode_Format() */
12988
12989static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012990getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012991{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012992 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012993 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012994 (*p_argidx)++;
12995 if (arglen < 0)
12996 return args;
12997 else
12998 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012999 }
13000 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013001 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013002 return NULL;
13003}
13004
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013005/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013006
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013007static PyObject *
13008formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013009{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013010 char *p;
13011 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013012 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013013
Guido van Rossumd57fd912000-03-10 22:53:23 +000013014 x = PyFloat_AsDouble(v);
13015 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013016 return NULL;
13017
Guido van Rossumd57fd912000-03-10 22:53:23 +000013018 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013019 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013020
Eric Smith0923d1d2009-04-16 20:16:10 +000013021 p = PyOS_double_to_string(x, type, prec,
13022 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013023 if (p == NULL)
13024 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013025 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013026 PyMem_Free(p);
13027 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013028}
13029
Tim Peters38fd5b62000-09-21 05:43:11 +000013030static PyObject*
13031formatlong(PyObject *val, int flags, int prec, int type)
13032{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013033 char *buf;
13034 int len;
13035 PyObject *str; /* temporary string object. */
13036 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013037
Benjamin Peterson14339b62009-01-31 16:36:08 +000013038 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13039 if (!str)
13040 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013041 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013042 Py_DECREF(str);
13043 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013044}
13045
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013046static Py_UCS4
13047formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013048{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013049 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013050 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013051 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013052 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013053 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013054 goto onError;
13055 }
13056 else {
13057 /* Integer input truncated to a character */
13058 long x;
13059 x = PyLong_AsLong(v);
13060 if (x == -1 && PyErr_Occurred())
13061 goto onError;
13062
Victor Stinner8faf8212011-12-08 22:14:11 +010013063 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013064 PyErr_SetString(PyExc_OverflowError,
13065 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013066 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013067 }
13068
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013069 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013070 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013071
Benjamin Peterson29060642009-01-31 22:14:21 +000013072 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013073 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013074 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013075 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013076}
13077
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013078static int
13079repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13080{
13081 int r;
13082 assert(count > 0);
13083 assert(PyUnicode_Check(obj));
13084 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013085 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013086 if (repeated == NULL)
13087 return -1;
13088 r = _PyAccu_Accumulate(acc, repeated);
13089 Py_DECREF(repeated);
13090 return r;
13091 }
13092 else {
13093 do {
13094 if (_PyAccu_Accumulate(acc, obj))
13095 return -1;
13096 } while (--count);
13097 return 0;
13098 }
13099}
13100
Alexander Belopolsky40018472011-02-26 01:02:56 +000013101PyObject *
13102PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013104 void *fmt;
13105 int fmtkind;
13106 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013107 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013108 int r;
13109 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013112 PyObject *temp = NULL;
13113 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013114 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013115 _PyAccu acc;
13116 static PyObject *plus, *minus, *blank, *zero, *percent;
13117
13118 if (!plus && !(plus = get_latin1_char('+')))
13119 return NULL;
13120 if (!minus && !(minus = get_latin1_char('-')))
13121 return NULL;
13122 if (!blank && !(blank = get_latin1_char(' ')))
13123 return NULL;
13124 if (!zero && !(zero = get_latin1_char('0')))
13125 return NULL;
13126 if (!percent && !(percent = get_latin1_char('%')))
13127 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013128
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013130 PyErr_BadInternalCall();
13131 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013133 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013134 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013135 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013136 if (_PyAccu_Init(&acc))
13137 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013138 fmt = PyUnicode_DATA(uformat);
13139 fmtkind = PyUnicode_KIND(uformat);
13140 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13141 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013142
Guido van Rossumd57fd912000-03-10 22:53:23 +000013143 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013144 arglen = PyTuple_Size(args);
13145 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013146 }
13147 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013148 arglen = -1;
13149 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013150 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013151 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013152 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013153 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013154
13155 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013156 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013157 PyObject *nonfmt;
13158 Py_ssize_t nonfmtpos;
13159 nonfmtpos = fmtpos++;
13160 while (fmtcnt >= 0 &&
13161 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13162 fmtpos++;
13163 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013164 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013165 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013166 if (nonfmt == NULL)
13167 goto onError;
13168 r = _PyAccu_Accumulate(&acc, nonfmt);
13169 Py_DECREF(nonfmt);
13170 if (r)
13171 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013172 }
13173 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013174 /* Got a format specifier */
13175 int flags = 0;
13176 Py_ssize_t width = -1;
13177 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013178 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013179 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013180 int isnumok;
13181 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013182 void *pbuf = NULL;
13183 Py_ssize_t pindex, len;
13184 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013186 fmtpos++;
13187 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13188 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013189 Py_ssize_t keylen;
13190 PyObject *key;
13191 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013192
Benjamin Peterson29060642009-01-31 22:14:21 +000013193 if (dict == NULL) {
13194 PyErr_SetString(PyExc_TypeError,
13195 "format requires a mapping");
13196 goto onError;
13197 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013198 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013199 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013200 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013201 /* Skip over balanced parentheses */
13202 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013203 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013204 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013205 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013206 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013207 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013208 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013209 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013210 if (fmtcnt < 0 || pcount > 0) {
13211 PyErr_SetString(PyExc_ValueError,
13212 "incomplete format key");
13213 goto onError;
13214 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013215 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013216 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013217 if (key == NULL)
13218 goto onError;
13219 if (args_owned) {
13220 Py_DECREF(args);
13221 args_owned = 0;
13222 }
13223 args = PyObject_GetItem(dict, key);
13224 Py_DECREF(key);
13225 if (args == NULL) {
13226 goto onError;
13227 }
13228 args_owned = 1;
13229 arglen = -1;
13230 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013231 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013232 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013233 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013234 case '-': flags |= F_LJUST; continue;
13235 case '+': flags |= F_SIGN; continue;
13236 case ' ': flags |= F_BLANK; continue;
13237 case '#': flags |= F_ALT; continue;
13238 case '0': flags |= F_ZERO; continue;
13239 }
13240 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013241 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013242 if (c == '*') {
13243 v = getnextarg(args, arglen, &argidx);
13244 if (v == NULL)
13245 goto onError;
13246 if (!PyLong_Check(v)) {
13247 PyErr_SetString(PyExc_TypeError,
13248 "* wants int");
13249 goto onError;
13250 }
13251 width = PyLong_AsLong(v);
13252 if (width == -1 && PyErr_Occurred())
13253 goto onError;
13254 if (width < 0) {
13255 flags |= F_LJUST;
13256 width = -width;
13257 }
13258 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013259 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013260 }
13261 else if (c >= '0' && c <= '9') {
13262 width = c - '0';
13263 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013264 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013265 if (c < '0' || c > '9')
13266 break;
13267 if ((width*10) / 10 != width) {
13268 PyErr_SetString(PyExc_ValueError,
13269 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013270 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013271 }
13272 width = width*10 + (c - '0');
13273 }
13274 }
13275 if (c == '.') {
13276 prec = 0;
13277 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013278 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013279 if (c == '*') {
13280 v = getnextarg(args, arglen, &argidx);
13281 if (v == NULL)
13282 goto onError;
13283 if (!PyLong_Check(v)) {
13284 PyErr_SetString(PyExc_TypeError,
13285 "* wants int");
13286 goto onError;
13287 }
13288 prec = PyLong_AsLong(v);
13289 if (prec == -1 && PyErr_Occurred())
13290 goto onError;
13291 if (prec < 0)
13292 prec = 0;
13293 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013294 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013295 }
13296 else if (c >= '0' && c <= '9') {
13297 prec = c - '0';
13298 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013299 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013300 if (c < '0' || c > '9')
13301 break;
13302 if ((prec*10) / 10 != prec) {
13303 PyErr_SetString(PyExc_ValueError,
13304 "prec too big");
13305 goto onError;
13306 }
13307 prec = prec*10 + (c - '0');
13308 }
13309 }
13310 } /* prec */
13311 if (fmtcnt >= 0) {
13312 if (c == 'h' || c == 'l' || c == 'L') {
13313 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013314 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013315 }
13316 }
13317 if (fmtcnt < 0) {
13318 PyErr_SetString(PyExc_ValueError,
13319 "incomplete format");
13320 goto onError;
13321 }
13322 if (c != '%') {
13323 v = getnextarg(args, arglen, &argidx);
13324 if (v == NULL)
13325 goto onError;
13326 }
13327 sign = 0;
13328 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013329 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013330 switch (c) {
13331
13332 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013333 _PyAccu_Accumulate(&acc, percent);
13334 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013335
13336 case 's':
13337 case 'r':
13338 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013339 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013340 temp = v;
13341 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013342 }
13343 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013344 if (c == 's')
13345 temp = PyObject_Str(v);
13346 else if (c == 'r')
13347 temp = PyObject_Repr(v);
13348 else
13349 temp = PyObject_ASCII(v);
13350 if (temp == NULL)
13351 goto onError;
13352 if (PyUnicode_Check(temp))
13353 /* nothing to do */;
13354 else {
13355 Py_DECREF(temp);
13356 PyErr_SetString(PyExc_TypeError,
13357 "%s argument has non-string str()");
13358 goto onError;
13359 }
13360 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013361 if (PyUnicode_READY(temp) == -1) {
13362 Py_CLEAR(temp);
13363 goto onError;
13364 }
13365 pbuf = PyUnicode_DATA(temp);
13366 kind = PyUnicode_KIND(temp);
13367 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013368 if (prec >= 0 && len > prec)
13369 len = prec;
13370 break;
13371
13372 case 'i':
13373 case 'd':
13374 case 'u':
13375 case 'o':
13376 case 'x':
13377 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013378 isnumok = 0;
13379 if (PyNumber_Check(v)) {
13380 PyObject *iobj=NULL;
13381
13382 if (PyLong_Check(v)) {
13383 iobj = v;
13384 Py_INCREF(iobj);
13385 }
13386 else {
13387 iobj = PyNumber_Long(v);
13388 }
13389 if (iobj!=NULL) {
13390 if (PyLong_Check(iobj)) {
13391 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013392 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013393 Py_DECREF(iobj);
13394 if (!temp)
13395 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013396 if (PyUnicode_READY(temp) == -1) {
13397 Py_CLEAR(temp);
13398 goto onError;
13399 }
13400 pbuf = PyUnicode_DATA(temp);
13401 kind = PyUnicode_KIND(temp);
13402 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013403 sign = 1;
13404 }
13405 else {
13406 Py_DECREF(iobj);
13407 }
13408 }
13409 }
13410 if (!isnumok) {
13411 PyErr_Format(PyExc_TypeError,
13412 "%%%c format: a number is required, "
13413 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13414 goto onError;
13415 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013416 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013417 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013418 fillobj = zero;
13419 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013420 break;
13421
13422 case 'e':
13423 case 'E':
13424 case 'f':
13425 case 'F':
13426 case 'g':
13427 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013428 temp = formatfloat(v, flags, prec, c);
13429 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013430 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013431 if (PyUnicode_READY(temp) == -1) {
13432 Py_CLEAR(temp);
13433 goto onError;
13434 }
13435 pbuf = PyUnicode_DATA(temp);
13436 kind = PyUnicode_KIND(temp);
13437 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013438 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013439 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013440 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013441 fillobj = zero;
13442 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013443 break;
13444
13445 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013446 {
13447 Py_UCS4 ch = formatchar(v);
13448 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013449 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013450 temp = _PyUnicode_FromUCS4(&ch, 1);
13451 if (temp == NULL)
13452 goto onError;
13453 pbuf = PyUnicode_DATA(temp);
13454 kind = PyUnicode_KIND(temp);
13455 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013456 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013457 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013458
13459 default:
13460 PyErr_Format(PyExc_ValueError,
13461 "unsupported format character '%c' (0x%x) "
13462 "at index %zd",
13463 (31<=c && c<=126) ? (char)c : '?',
13464 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013465 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013466 goto onError;
13467 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013468 /* pbuf is initialized here. */
13469 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013470 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013471 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13472 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013473 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013474 pindex++;
13475 }
13476 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13477 signobj = plus;
13478 len--;
13479 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013480 }
13481 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013482 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013483 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013484 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013485 else
13486 sign = 0;
13487 }
13488 if (width < len)
13489 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013490 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013491 if (fill != ' ') {
13492 assert(signobj != NULL);
13493 if (_PyAccu_Accumulate(&acc, signobj))
13494 goto onError;
13495 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013496 if (width > len)
13497 width--;
13498 }
13499 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013500 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013501 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013502 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013503 second = get_latin1_char(
13504 PyUnicode_READ(kind, pbuf, pindex + 1));
13505 pindex += 2;
13506 if (second == NULL ||
13507 _PyAccu_Accumulate(&acc, zero) ||
13508 _PyAccu_Accumulate(&acc, second))
13509 goto onError;
13510 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013511 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013512 width -= 2;
13513 if (width < 0)
13514 width = 0;
13515 len -= 2;
13516 }
13517 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013518 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013519 if (repeat_accumulate(&acc, fillobj, width - len))
13520 goto onError;
13521 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013522 }
13523 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013524 if (sign) {
13525 assert(signobj != NULL);
13526 if (_PyAccu_Accumulate(&acc, signobj))
13527 goto onError;
13528 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013529 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013530 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13531 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013532 second = get_latin1_char(
13533 PyUnicode_READ(kind, pbuf, pindex + 1));
13534 pindex += 2;
13535 if (second == NULL ||
13536 _PyAccu_Accumulate(&acc, zero) ||
13537 _PyAccu_Accumulate(&acc, second))
13538 goto onError;
13539 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013540 }
13541 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013542 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013543 if (temp != NULL) {
13544 assert(pbuf == PyUnicode_DATA(temp));
13545 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013546 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013547 else {
13548 const char *p = (const char *) pbuf;
13549 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013550 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013551 v = PyUnicode_FromKindAndData(kind, p, len);
13552 }
13553 if (v == NULL)
13554 goto onError;
13555 r = _PyAccu_Accumulate(&acc, v);
13556 Py_DECREF(v);
13557 if (r)
13558 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013559 if (width > len && repeat_accumulate(&acc, blank, width - len))
13560 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013561 if (dict && (argidx < arglen) && c != '%') {
13562 PyErr_SetString(PyExc_TypeError,
13563 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013564 goto onError;
13565 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013566 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013567 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013568 } /* until end */
13569 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013570 PyErr_SetString(PyExc_TypeError,
13571 "not all arguments converted during string formatting");
13572 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013573 }
13574
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013575 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013576 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013577 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013578 }
13579 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013580 Py_XDECREF(temp);
13581 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013582 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013583
Benjamin Peterson29060642009-01-31 22:14:21 +000013584 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013585 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013586 Py_XDECREF(temp);
13587 Py_XDECREF(second);
13588 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013589 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013590 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013591 }
13592 return NULL;
13593}
13594
Jeremy Hylton938ace62002-07-17 16:30:39 +000013595static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013596unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13597
Tim Peters6d6c1a32001-08-02 04:15:00 +000013598static PyObject *
13599unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13600{
Benjamin Peterson29060642009-01-31 22:14:21 +000013601 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013602 static char *kwlist[] = {"object", "encoding", "errors", 0};
13603 char *encoding = NULL;
13604 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013605
Benjamin Peterson14339b62009-01-31 16:36:08 +000013606 if (type != &PyUnicode_Type)
13607 return unicode_subtype_new(type, args, kwds);
13608 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013609 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013610 return NULL;
13611 if (x == NULL)
Victor Stinner7931d9a2011-11-04 00:22:48 +010013612 return PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013613 if (encoding == NULL && errors == NULL)
13614 return PyObject_Str(x);
13615 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013616 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013617}
13618
Guido van Rossume023fe02001-08-30 03:12:59 +000013619static PyObject *
13620unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13621{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013622 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013623 Py_ssize_t length, char_size;
13624 int share_wstr, share_utf8;
13625 unsigned int kind;
13626 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013627
Benjamin Peterson14339b62009-01-31 16:36:08 +000013628 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013629
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013630 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013631 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013632 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013633 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013634 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013635 return NULL;
13636
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013637 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013638 if (self == NULL) {
13639 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013640 return NULL;
13641 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013642 kind = PyUnicode_KIND(unicode);
13643 length = PyUnicode_GET_LENGTH(unicode);
13644
13645 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013646#ifdef Py_DEBUG
13647 _PyUnicode_HASH(self) = -1;
13648#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013649 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013650#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013651 _PyUnicode_STATE(self).interned = 0;
13652 _PyUnicode_STATE(self).kind = kind;
13653 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013654 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013655 _PyUnicode_STATE(self).ready = 1;
13656 _PyUnicode_WSTR(self) = NULL;
13657 _PyUnicode_UTF8_LENGTH(self) = 0;
13658 _PyUnicode_UTF8(self) = NULL;
13659 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013660 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013661
13662 share_utf8 = 0;
13663 share_wstr = 0;
13664 if (kind == PyUnicode_1BYTE_KIND) {
13665 char_size = 1;
13666 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13667 share_utf8 = 1;
13668 }
13669 else if (kind == PyUnicode_2BYTE_KIND) {
13670 char_size = 2;
13671 if (sizeof(wchar_t) == 2)
13672 share_wstr = 1;
13673 }
13674 else {
13675 assert(kind == PyUnicode_4BYTE_KIND);
13676 char_size = 4;
13677 if (sizeof(wchar_t) == 4)
13678 share_wstr = 1;
13679 }
13680
13681 /* Ensure we won't overflow the length. */
13682 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13683 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013684 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013685 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013686 data = PyObject_MALLOC((length + 1) * char_size);
13687 if (data == NULL) {
13688 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013689 goto onError;
13690 }
13691
Victor Stinnerc3c74152011-10-02 20:39:55 +020013692 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013693 if (share_utf8) {
13694 _PyUnicode_UTF8_LENGTH(self) = length;
13695 _PyUnicode_UTF8(self) = data;
13696 }
13697 if (share_wstr) {
13698 _PyUnicode_WSTR_LENGTH(self) = length;
13699 _PyUnicode_WSTR(self) = (wchar_t *)data;
13700 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013701
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013702 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013703 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013704 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013705#ifdef Py_DEBUG
13706 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13707#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013708 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013709 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013710
13711onError:
13712 Py_DECREF(unicode);
13713 Py_DECREF(self);
13714 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013715}
13716
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013717PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013718 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013719\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013720Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013721encoding defaults to the current default string encoding.\n\
13722errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013723
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013724static PyObject *unicode_iter(PyObject *seq);
13725
Guido van Rossumd57fd912000-03-10 22:53:23 +000013726PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013727 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013728 "str", /* tp_name */
13729 sizeof(PyUnicodeObject), /* tp_size */
13730 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013731 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013732 (destructor)unicode_dealloc, /* tp_dealloc */
13733 0, /* tp_print */
13734 0, /* tp_getattr */
13735 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013736 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013737 unicode_repr, /* tp_repr */
13738 &unicode_as_number, /* tp_as_number */
13739 &unicode_as_sequence, /* tp_as_sequence */
13740 &unicode_as_mapping, /* tp_as_mapping */
13741 (hashfunc) unicode_hash, /* tp_hash*/
13742 0, /* tp_call*/
13743 (reprfunc) unicode_str, /* tp_str */
13744 PyObject_GenericGetAttr, /* tp_getattro */
13745 0, /* tp_setattro */
13746 0, /* tp_as_buffer */
13747 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013748 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013749 unicode_doc, /* tp_doc */
13750 0, /* tp_traverse */
13751 0, /* tp_clear */
13752 PyUnicode_RichCompare, /* tp_richcompare */
13753 0, /* tp_weaklistoffset */
13754 unicode_iter, /* tp_iter */
13755 0, /* tp_iternext */
13756 unicode_methods, /* tp_methods */
13757 0, /* tp_members */
13758 0, /* tp_getset */
13759 &PyBaseObject_Type, /* tp_base */
13760 0, /* tp_dict */
13761 0, /* tp_descr_get */
13762 0, /* tp_descr_set */
13763 0, /* tp_dictoffset */
13764 0, /* tp_init */
13765 0, /* tp_alloc */
13766 unicode_new, /* tp_new */
13767 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013768};
13769
13770/* Initialize the Unicode implementation */
13771
Victor Stinner3a50e702011-10-18 21:21:00 +020013772int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013773{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013774 int i;
13775
Thomas Wouters477c8d52006-05-27 19:21:47 +000013776 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013777 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013778 0x000A, /* LINE FEED */
13779 0x000D, /* CARRIAGE RETURN */
13780 0x001C, /* FILE SEPARATOR */
13781 0x001D, /* GROUP SEPARATOR */
13782 0x001E, /* RECORD SEPARATOR */
13783 0x0085, /* NEXT LINE */
13784 0x2028, /* LINE SEPARATOR */
13785 0x2029, /* PARAGRAPH SEPARATOR */
13786 };
13787
Fred Drakee4315f52000-05-09 19:53:39 +000013788 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013789 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013790 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013791 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010013792 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013793
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013794 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013795 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013796 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013797 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013798
13799 /* initialize the linebreak bloom filter */
13800 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013801 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013802 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013803
13804 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013805
13806#ifdef HAVE_MBCS
13807 winver.dwOSVersionInfoSize = sizeof(winver);
13808 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13809 PyErr_SetFromWindowsErr(0);
13810 return -1;
13811 }
13812#endif
13813 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013814}
13815
13816/* Finalize the Unicode implementation */
13817
/* This function does no work and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;
    return freed;
}
13823
Guido van Rossumd57fd912000-03-10 22:53:23 +000013824void
Thomas Wouters78890102000-07-22 19:25:51 +000013825_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013826{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013827 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013828
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013829 Py_XDECREF(unicode_empty);
13830 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013831
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013832 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013833 if (unicode_latin1[i]) {
13834 Py_DECREF(unicode_latin1[i]);
13835 unicode_latin1[i] = NULL;
13836 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013837 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013838 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013839 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013840}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013841
Walter Dörwald16807132007-05-25 13:52:07 +000013842void
13843PyUnicode_InternInPlace(PyObject **p)
13844{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013845 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013846 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013847#ifdef Py_DEBUG
13848 assert(s != NULL);
13849 assert(_PyUnicode_CHECK(s));
13850#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013851 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013852 return;
13853#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013854 /* If it's a subclass, we don't really know what putting
13855 it in the interned dict might do. */
13856 if (!PyUnicode_CheckExact(s))
13857 return;
13858 if (PyUnicode_CHECK_INTERNED(s))
13859 return;
13860 if (interned == NULL) {
13861 interned = PyDict_New();
13862 if (interned == NULL) {
13863 PyErr_Clear(); /* Don't leave an exception */
13864 return;
13865 }
13866 }
13867 /* It might be that the GetItem call fails even
13868 though the key is present in the dictionary,
13869 namely when this happens during a stack overflow. */
13870 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013871 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013872 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013873
Benjamin Peterson29060642009-01-31 22:14:21 +000013874 if (t) {
13875 Py_INCREF(t);
13876 Py_DECREF(*p);
13877 *p = t;
13878 return;
13879 }
Walter Dörwald16807132007-05-25 13:52:07 +000013880
Benjamin Peterson14339b62009-01-31 16:36:08 +000013881 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013882 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013883 PyErr_Clear();
13884 PyThreadState_GET()->recursion_critical = 0;
13885 return;
13886 }
13887 PyThreadState_GET()->recursion_critical = 0;
13888 /* The two references in interned are not counted by refcnt.
13889 The deallocator will take care of this */
13890 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013891 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013892}
13893
13894void
13895PyUnicode_InternImmortal(PyObject **p)
13896{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013897 PyUnicode_InternInPlace(p);
13898 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020013899 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013900 Py_INCREF(*p);
13901 }
Walter Dörwald16807132007-05-25 13:52:07 +000013902}
13903
13904PyObject *
13905PyUnicode_InternFromString(const char *cp)
13906{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013907 PyObject *s = PyUnicode_FromString(cp);
13908 if (s == NULL)
13909 return NULL;
13910 PyUnicode_InternInPlace(&s);
13911 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013912}
13913
Alexander Belopolsky40018472011-02-26 01:02:56 +000013914void
13915_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013916{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013917 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013918 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013919 Py_ssize_t i, n;
13920 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013921
Benjamin Peterson14339b62009-01-31 16:36:08 +000013922 if (interned == NULL || !PyDict_Check(interned))
13923 return;
13924 keys = PyDict_Keys(interned);
13925 if (keys == NULL || !PyList_Check(keys)) {
13926 PyErr_Clear();
13927 return;
13928 }
Walter Dörwald16807132007-05-25 13:52:07 +000013929
Benjamin Peterson14339b62009-01-31 16:36:08 +000013930 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13931 detector, interned unicode strings are not forcibly deallocated;
13932 rather, we give them their stolen references back, and then clear
13933 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013934
Benjamin Peterson14339b62009-01-31 16:36:08 +000013935 n = PyList_GET_SIZE(keys);
13936 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013937 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013938 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013939 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013940 if (PyUnicode_READY(s) == -1) {
13941 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013942 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013943 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013944 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013945 case SSTATE_NOT_INTERNED:
13946 /* XXX Shouldn't happen */
13947 break;
13948 case SSTATE_INTERNED_IMMORTAL:
13949 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013950 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013951 break;
13952 case SSTATE_INTERNED_MORTAL:
13953 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013954 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013955 break;
13956 default:
13957 Py_FatalError("Inconsistent interned string state.");
13958 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013959 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013960 }
13961 fprintf(stderr, "total size of all interned strings: "
13962 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13963 "mortal/immortal\n", mortal_size, immortal_size);
13964 Py_DECREF(keys);
13965 PyDict_Clear(interned);
13966 Py_DECREF(interned);
13967 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013968}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013969
13970
13971/********************* Unicode Iterator **************************/
13972
13973typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013974 PyObject_HEAD
13975 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013976 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013977} unicodeiterobject;
13978
13979static void
13980unicodeiter_dealloc(unicodeiterobject *it)
13981{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013982 _PyObject_GC_UNTRACK(it);
13983 Py_XDECREF(it->it_seq);
13984 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013985}
13986
13987static int
13988unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13989{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013990 Py_VISIT(it->it_seq);
13991 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013992}
13993
13994static PyObject *
13995unicodeiter_next(unicodeiterobject *it)
13996{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013997 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013998
Benjamin Peterson14339b62009-01-31 16:36:08 +000013999 assert(it != NULL);
14000 seq = it->it_seq;
14001 if (seq == NULL)
14002 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014003 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014005 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14006 int kind = PyUnicode_KIND(seq);
14007 void *data = PyUnicode_DATA(seq);
14008 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14009 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014010 if (item != NULL)
14011 ++it->it_index;
14012 return item;
14013 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014014
Benjamin Peterson14339b62009-01-31 16:36:08 +000014015 Py_DECREF(seq);
14016 it->it_seq = NULL;
14017 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014018}
14019
14020static PyObject *
14021unicodeiter_len(unicodeiterobject *it)
14022{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014023 Py_ssize_t len = 0;
14024 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014025 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014026 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014027}
14028
14029PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14030
14031static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014032 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014033 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014034 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014035};
14036
14037PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014038 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14039 "str_iterator", /* tp_name */
14040 sizeof(unicodeiterobject), /* tp_basicsize */
14041 0, /* tp_itemsize */
14042 /* methods */
14043 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14044 0, /* tp_print */
14045 0, /* tp_getattr */
14046 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014047 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014048 0, /* tp_repr */
14049 0, /* tp_as_number */
14050 0, /* tp_as_sequence */
14051 0, /* tp_as_mapping */
14052 0, /* tp_hash */
14053 0, /* tp_call */
14054 0, /* tp_str */
14055 PyObject_GenericGetAttr, /* tp_getattro */
14056 0, /* tp_setattro */
14057 0, /* tp_as_buffer */
14058 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14059 0, /* tp_doc */
14060 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14061 0, /* tp_clear */
14062 0, /* tp_richcompare */
14063 0, /* tp_weaklistoffset */
14064 PyObject_SelfIter, /* tp_iter */
14065 (iternextfunc)unicodeiter_next, /* tp_iternext */
14066 unicodeiter_methods, /* tp_methods */
14067 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014068};
14069
14070static PyObject *
14071unicode_iter(PyObject *seq)
14072{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014073 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014074
Benjamin Peterson14339b62009-01-31 16:36:08 +000014075 if (!PyUnicode_Check(seq)) {
14076 PyErr_BadInternalCall();
14077 return NULL;
14078 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014079 if (PyUnicode_READY(seq) == -1)
14080 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014081 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14082 if (it == NULL)
14083 return NULL;
14084 it->it_index = 0;
14085 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014086 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014087 _PyObject_GC_TRACK(it);
14088 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014089}
14090
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014091
14092size_t
14093Py_UNICODE_strlen(const Py_UNICODE *u)
14094{
14095 int res = 0;
14096 while(*u++)
14097 res++;
14098 return res;
14099}
14100
14101Py_UNICODE*
14102Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14103{
14104 Py_UNICODE *u = s1;
14105 while ((*u++ = *s2++));
14106 return s1;
14107}
14108
14109Py_UNICODE*
14110Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14111{
14112 Py_UNICODE *u = s1;
14113 while ((*u++ = *s2++))
14114 if (n-- == 0)
14115 break;
14116 return s1;
14117}
14118
14119Py_UNICODE*
14120Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14121{
14122 Py_UNICODE *u1 = s1;
14123 u1 += Py_UNICODE_strlen(u1);
14124 Py_UNICODE_strcpy(u1, s2);
14125 return s1;
14126}
14127
14128int
14129Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14130{
14131 while (*s1 && *s2 && *s1 == *s2)
14132 s1++, s2++;
14133 if (*s1 && *s2)
14134 return (*s1 < *s2) ? -1 : +1;
14135 if (*s1)
14136 return 1;
14137 if (*s2)
14138 return -1;
14139 return 0;
14140}
14141
14142int
14143Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14144{
14145 register Py_UNICODE u1, u2;
14146 for (; n != 0; n--) {
14147 u1 = *s1;
14148 u2 = *s2;
14149 if (u1 != u2)
14150 return (u1 < u2) ? -1 : +1;
14151 if (u1 == '\0')
14152 return 0;
14153 s1++;
14154 s2++;
14155 }
14156 return 0;
14157}
14158
14159Py_UNICODE*
14160Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14161{
14162 const Py_UNICODE *p;
14163 for (p = s; *p; p++)
14164 if (*p == c)
14165 return (Py_UNICODE*)p;
14166 return NULL;
14167}
14168
14169Py_UNICODE*
14170Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14171{
14172 const Py_UNICODE *p;
14173 p = s + Py_UNICODE_strlen(s);
14174 while (p != s) {
14175 p--;
14176 if (*p == c)
14177 return (Py_UNICODE*)p;
14178 }
14179 return NULL;
14180}
Victor Stinner331ea922010-08-10 16:37:20 +000014181
Victor Stinner71133ff2010-09-01 23:43:53 +000014182Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014183PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014184{
Victor Stinner577db2c2011-10-11 22:12:48 +020014185 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014186 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014188 if (!PyUnicode_Check(unicode)) {
14189 PyErr_BadArgument();
14190 return NULL;
14191 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014192 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014193 if (u == NULL)
14194 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014195 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014196 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014197 PyErr_NoMemory();
14198 return NULL;
14199 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014200 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014201 size *= sizeof(Py_UNICODE);
14202 copy = PyMem_Malloc(size);
14203 if (copy == NULL) {
14204 PyErr_NoMemory();
14205 return NULL;
14206 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014207 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014208 return copy;
14209}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014210
Georg Brandl66c221e2010-10-14 07:04:07 +000014211/* A _string module, to export formatter_parser and formatter_field_name_split
14212 to the string.Formatter class implemented in Python. */
14213
14214static PyMethodDef _string_methods[] = {
14215 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14216 METH_O, PyDoc_STR("split the argument as a field name")},
14217 {"formatter_parser", (PyCFunction) formatter_parser,
14218 METH_O, PyDoc_STR("parse the argument as a format string")},
14219 {NULL, NULL}
14220};
14221
14222static struct PyModuleDef _string_module = {
14223 PyModuleDef_HEAD_INIT,
14224 "_string",
14225 PyDoc_STR("string helper module"),
14226 0,
14227 _string_methods,
14228 NULL,
14229 NULL,
14230 NULL,
14231 NULL
14232};
14233
14234PyMODINIT_FUNC
14235PyInit__string(void)
14236{
14237 return PyModule_Create(&_string_module);
14238}
14239
14240
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014241#ifdef __cplusplus
14242}
14243#endif