blob: 40b2a8839088c7ae17a141db5899535b8c657ed2 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Limit for the Unicode object free list: maximum number of entries. */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay-alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is
   defined, selected by the WORDS_BIGENDIAN configuration macro. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* _PyUnicode_CHECK(op): sanity check used inside assert() by the macros
   below.  In Py_DEBUG builds it expands to the full
   _PyUnicode_CheckConsistency() validation; otherwise it is only the
   PyUnicode_Check() type test. */
#ifdef Py_DEBUG
# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#else
# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020097
/* Field accessors.

   The macros with a leading underscore and no assert() are raw lvalue
   accessors: they cast `op` to the struct that owns the field and perform
   no checking at all, so they can be used while an object is still being
   built.  The PyUnicode_* / assert()-carrying variants are read-only
   expressions that first check the object with _PyUnicode_CHECK(). */

/* Raw `utf8` pointer stored in the PyCompactUnicodeObject part. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready string.  For a compact ASCII string the
   characters start directly after the PyASCIIObject header; otherwise the
   `utf8` field is returned. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw `utf8_length` field of the PyCompactUnicodeObject part. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: the character length for a compact
   ASCII string, the `utf8_length` field otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw `wstr` pointer (PyASCIIObject) and `wstr_length`
   (PyCompactUnicodeObject). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw `length`, `state` and `hash` fields of the PyASCIIObject header. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Checked read of `state.kind`. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* Checked read of `length`. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw `data.any` pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200132
/* Local override of PyUnicode_READY(): same contract, but the argument is
   validated with _PyUnicode_CHECK() first.  Evaluates to 0 when `op` is
   already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))

/* Like PyUnicode_READY(), but takes the address of an object pointer and
   calls _PyUnicode_ReadyReplace() on it when the object is not ready.
   `p_obj` is parenthesized before being dereferenced so that any pointer
   expression (e.g. `items + 1`) refers to the intended slot. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
144
/* True if the UTF-8 pointer of `op` aliases its character data instead of
   owning a separate buffer.  Must not be used on compact ASCII strings,
   which have no `utf8` field. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` aliases its character data instead of
   owning a separate buffer.  Both sides of the comparison use the macro
   argument, so the macro works whatever the caller's variable is named. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
152
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its utf8
   pointer is set, and that pointer is not the character data itself */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): wstr is set and either the string is not
   ready yet or wstr is not the character data itself.  The readiness test
   comes first, so PyUnicode_DATA() is only evaluated for ready strings. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
168
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element in [begin, end) is converted with a plain C cast to
   to_type.  `begin` and `to` are evaluated once, but `end` is re-evaluated
   on every iteration, so it must be free of side effects.  The destination
   buffer must have room for (end - begin) elements. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_; to_type *to_;           \
        for (iter_ = (begin), to_ = (to_type *)(to);    \
             iter_ < (end);                             \
             ++iter_, ++to_) {                          \
            *to_ = (to_type)*iter_;                     \
        }                                               \
    } while (0)

/* The Unicode string has been modified: reset the hash.
   The hash field is set to -1. */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
186
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance. */
static PyObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table has one slot per code point 0..255. */
static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203
/* Fast detection of the most frequent whitespace characters.
   Lookup table with one entry per code point 0..127 (16 rows of 8);
   an entry is 1 if the code point is whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40..0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
234
/* Forward declarations of file-local helpers that are referenced before
   their definitions appear later in this file. */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);

static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       const Py_UNICODE *unicode, Py_ssize_t size,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Same for linebreaks.
   Lookup table with one entry per code point 0..127 (16 rows of 8);
   an entry is 1 if the code point is a line break and 0 otherwise.
   It is consulted by BLOOM_LINEBREAK() for characters below 128. */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40..0x7F: no line break characters */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
/* Validate the internal invariants of a Unicode object.

   In Py_DEBUG builds every invariant is checked with assert(), so a
   violation aborts; the function itself always returns 1, which lets it be
   used inside assert() through _PyUnicode_CHECK().  In other builds it is
   a stub that returns 1 without looking at `op`.

   `op` must point to an object for which PyUnicode_Check() is true. */
#ifdef Py_DEBUG
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject header exists, so none of
           the utf8/wstr_length fields below may be inspected */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header and
               the utf8 buffer never aliases them */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        } else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that is not ready: wstr is the only
                   buffer; no data, no utf8, not interned */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
            else {
                /* legacy string that is ready: has a data buffer; utf8
                   aliases the data exactly when the string is ASCII */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data exactly when the character width
               equals sizeof(wchar_t) */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* an absent buffer must have a zero recorded length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    return 1;
}
#else
static int
_PyUnicode_CheckConsistency(void *op)
{
    return 1;
}
#endif
379
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   (ch & (BLOOM_WIDTH - 1)) bits of each unicode character as the bit
   index. */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH: number of mask bits used, the largest of 128/64/32 that
   fits in an unsigned long (LONG_BIT). */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit for `ch` in `mask` (which must be a modifiable
   lvalue); BLOOM tests that bit and is non-zero if it is set.  A set bit
   only means the character *may* be present, since distinct characters
   can map to the same bit. */
#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: characters below 128 use the ascii_linebreak table;
   others are first filtered through bloom_linebreak and then confirmed
   with Py_UNICODE_ISLINEBREAK().  `ch` is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000408
Alexander Belopolsky40018472011-02-26 01:02:56 +0000409Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200410make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000411{
412 /* calculate simple bloom-style bitmask for a given unicode string */
413
Antoine Pitrouf068f942010-01-13 14:19:12 +0000414 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000415 Py_ssize_t i;
416
417 mask = 0;
418 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200419 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000420
421 return mask;
422}
423
/* Membership test of character `chr` in the string object `str` whose
   bloom mask is `mask`: the cheap bit test runs first, and only when it
   passes is the string actually searched with PyUnicode_FindChar(). */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000427
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428/* --- Unicode Object ----------------------------------------------------- */
429
/* Forward declaration; the definition is elsewhere in this file.  Takes
   the string `self` and a callback `fixfct` that receives a string and
   returns a Py_UCS4. */
static PyObject *
fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
432
433Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
434 Py_ssize_t size, Py_UCS4 ch,
435 int direction)
436{
437 /* like wcschr, but doesn't stop at NULL characters */
438 Py_ssize_t i;
439 if (direction == 1) {
440 for(i = 0; i < size; i++)
441 if (PyUnicode_READ(kind, s, i) == ch)
442 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
443 }
444 else {
445 for(i = size-1; i >= 0; i--)
446 if (PyUnicode_READ(kind, s, i) == ch)
447 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
448 }
449 return NULL;
450}
451
Victor Stinnerfe226c02011-10-03 03:52:20 +0200452static PyObject*
453resize_compact(PyObject *unicode, Py_ssize_t length)
454{
455 Py_ssize_t char_size;
456 Py_ssize_t struct_size;
457 Py_ssize_t new_size;
458 int share_wstr;
459
460 assert(PyUnicode_IS_READY(unicode));
461 char_size = PyUnicode_CHARACTER_SIZE(unicode);
462 if (PyUnicode_IS_COMPACT_ASCII(unicode))
463 struct_size = sizeof(PyASCIIObject);
464 else
465 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200466 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200467
468 _Py_DEC_REFTOTAL;
469 _Py_ForgetReference(unicode);
470
471 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
472 PyErr_NoMemory();
473 return NULL;
474 }
475 new_size = (struct_size + (length + 1) * char_size);
476
477 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
478 if (unicode == NULL) {
479 PyObject_Del(unicode);
480 PyErr_NoMemory();
481 return NULL;
482 }
483 _Py_NewReference(unicode);
484 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200485 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200486 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200487 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
488 _PyUnicode_WSTR_LENGTH(unicode) = length;
489 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200490 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
491 length, 0);
492 return unicode;
493}
494
/* Resize a non-compact Unicode object in place to `length` characters.

   The object must not be compact and must have a reference count of 1.
   Its cached hash is invalidated first.  Returns 0 on success, or -1 with
   MemoryError set if the size would overflow or a reallocation fails.

   For a ready string the character data is reallocated, and wstr/utf8 are
   re-pointed at it when they aliased the old data.  A separately
   allocated utf8 buffer is released.  If wstr is a separate buffer (or
   the string is not ready), the wstr buffer is reallocated as well. */
static int
resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    _PyUnicode_DIRTY(unicode);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        Py_ssize_t new_size;
        int share_wstr, share_utf8;
        void *data;

        data = _PyUnicode_DATA_ANY(unicode);
        assert(data != NULL);
        char_size = PyUnicode_CHARACTER_SIZE(unicode);
        /* remember the aliasing before the data pointer changes */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            /* drop the separately allocated UTF-8 buffer */
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /* check for integer overflow of (length + 1) * char_size */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* the result goes to the local first, so on failure the object
           still holds its old data pointer */
        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* null-terminate the character data */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            /* no separate wstr buffer to resize: done */
            _PyUnicode_CheckConsistency(unicode);
            return 0;
        }
    }
    /* from here on: resize the separately allocated wstr buffer */
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    wstr =  _PyUnicode_WSTR(unicode);
    wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    _PyUnicode_CheckConsistency(unicode);
    return 0;
}
568
Victor Stinnerfe226c02011-10-03 03:52:20 +0200569static PyObject*
570resize_copy(PyObject *unicode, Py_ssize_t length)
571{
572 Py_ssize_t copy_length;
573 if (PyUnicode_IS_COMPACT(unicode)) {
574 PyObject *copy;
575 assert(PyUnicode_IS_READY(unicode));
576
577 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
578 if (copy == NULL)
579 return NULL;
580
581 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
582 if (PyUnicode_CopyCharacters(copy, 0,
583 unicode, 0,
584 copy_length) < 0)
585 {
586 Py_DECREF(copy);
587 return NULL;
588 }
589 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200590 }
591 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200592 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200593 assert(_PyUnicode_WSTR(unicode) != NULL);
594 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200595 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200596 if (w == NULL)
597 return NULL;
598 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
599 copy_length = Py_MIN(copy_length, length);
600 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
601 copy_length);
602 return (PyObject*)w;
603 }
604}
605
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/

#ifdef Py_DEBUG
/* Debug builds only: number of calls to _PyUnicode_New() that allocated a
   new object. */
int unicode_old_new_calls = 0;
#endif
618
Alexander Belopolsky40018472011-02-26 01:02:56 +0000619static PyUnicodeObject *
620_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621{
622 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200623 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000624
Thomas Wouters477c8d52006-05-27 19:21:47 +0000625 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 if (length == 0 && unicode_empty != NULL) {
627 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200628 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629 }
630
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000631 /* Ensure we won't overflow the size. */
632 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
633 return (PyUnicodeObject *)PyErr_NoMemory();
634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200635 if (length < 0) {
636 PyErr_SetString(PyExc_SystemError,
637 "Negative size passed to _PyUnicode_New");
638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000639 }
640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200641#ifdef Py_DEBUG
642 ++unicode_old_new_calls;
643#endif
644
645 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
646 if (unicode == NULL)
647 return NULL;
648 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
649 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
650 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000651 PyErr_NoMemory();
652 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200654
Jeremy Hyltond8082792003-09-16 19:41:39 +0000655 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000656 * the caller fails before initializing str -- unicode_resize()
657 * reads str[0], and the Keep-Alive optimization can keep memory
658 * allocated for str alive across a call to unicode_dealloc(unicode).
659 * We don't want unicode_resize to read uninitialized memory in
660 * that case.
661 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200662 _PyUnicode_WSTR(unicode)[0] = 0;
663 _PyUnicode_WSTR(unicode)[length] = 0;
664 _PyUnicode_WSTR_LENGTH(unicode) = length;
665 _PyUnicode_HASH(unicode) = -1;
666 _PyUnicode_STATE(unicode).interned = 0;
667 _PyUnicode_STATE(unicode).kind = 0;
668 _PyUnicode_STATE(unicode).compact = 0;
669 _PyUnicode_STATE(unicode).ready = 0;
670 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200671 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200672 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200673 _PyUnicode_UTF8(unicode) = NULL;
674 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000675 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000676
Benjamin Peterson29060642009-01-31 22:14:21 +0000677 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000678 /* XXX UNREF/NEWREF interface should be more symmetrical */
679 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000680 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000681 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000682 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000683}
684
Victor Stinnerf42dc442011-10-02 23:33:16 +0200685static const char*
686unicode_kind_name(PyObject *unicode)
687{
Victor Stinner42dfd712011-10-03 14:41:45 +0200688 /* don't check consistency: unicode_kind_name() is called from
689 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200690 if (!PyUnicode_IS_COMPACT(unicode))
691 {
692 if (!PyUnicode_IS_READY(unicode))
693 return "wstr";
694 switch(PyUnicode_KIND(unicode))
695 {
696 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200697 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200698 return "legacy ascii";
699 else
700 return "legacy latin1";
701 case PyUnicode_2BYTE_KIND:
702 return "legacy UCS2";
703 case PyUnicode_4BYTE_KIND:
704 return "legacy UCS4";
705 default:
706 return "<legacy invalid kind>";
707 }
708 }
709 assert(PyUnicode_IS_READY(unicode));
710 switch(PyUnicode_KIND(unicode))
711 {
712 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200713 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200714 return "ascii";
715 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200716 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200717 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200718 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200719 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200720 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200721 default:
722 return "<invalid compact kind>";
723 }
724}
725
#ifdef Py_DEBUG
/* Debug-build counter; this file increments it from PyUnicode_New(). */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Return PyUnicode_UTF8(unicode). */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Return _PyUnicode_COMPACT_DATA(unicode). */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the object address, its compact / compact-ascii flags and the
   candidate data addresses to stdout, then return PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line description of a unicode object to stdout: kind name,
   length, the wstr pointer, the utf8 pointer (skipped for compact ASCII
   objects) and the data pointer.  "shared " is printed before a pointer
   that is equal to the data pointer. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* Lengths are signed sizes: print them with %zd, not %zu. */
    printf("%s: len=%zd, ", unicode_kind_name(op),
           (Py_ssize_t)ascii->length);
    if (ascii->state.compact)
        data = (compact + 1);
    else
        data = unicode->data.any;
    if (ascii->wstr == data)
        printf("shared ");
    /* %p requires a void * argument. */
    printf("wstr=%p", (void *)ascii->wstr);
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", (Py_ssize_t)compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", (void *)compact->utf8,
               (Py_ssize_t)compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
771
772PyObject *
773PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
774{
775 PyObject *obj;
776 PyCompactUnicodeObject *unicode;
777 void *data;
778 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200779 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200780 Py_ssize_t char_size;
781 Py_ssize_t struct_size;
782
783 /* Optimization for empty strings */
784 if (size == 0 && unicode_empty != NULL) {
785 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200786 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200787 }
788
789#ifdef Py_DEBUG
790 ++unicode_new_new_calls;
791#endif
792
Victor Stinner9e9d6892011-10-04 01:02:02 +0200793 is_ascii = 0;
794 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200795 struct_size = sizeof(PyCompactUnicodeObject);
796 if (maxchar < 128) {
797 kind_state = PyUnicode_1BYTE_KIND;
798 char_size = 1;
799 is_ascii = 1;
800 struct_size = sizeof(PyASCIIObject);
801 }
802 else if (maxchar < 256) {
803 kind_state = PyUnicode_1BYTE_KIND;
804 char_size = 1;
805 }
806 else if (maxchar < 65536) {
807 kind_state = PyUnicode_2BYTE_KIND;
808 char_size = 2;
809 if (sizeof(wchar_t) == 2)
810 is_sharing = 1;
811 }
812 else {
813 kind_state = PyUnicode_4BYTE_KIND;
814 char_size = 4;
815 if (sizeof(wchar_t) == 4)
816 is_sharing = 1;
817 }
818
819 /* Ensure we won't overflow the size. */
820 if (size < 0) {
821 PyErr_SetString(PyExc_SystemError,
822 "Negative size passed to PyUnicode_New");
823 return NULL;
824 }
825 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
826 return PyErr_NoMemory();
827
828 /* Duplicated allocation code from _PyObject_New() instead of a call to
829 * PyObject_New() so we are able to allocate space for the object and
830 * it's data buffer.
831 */
832 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
833 if (obj == NULL)
834 return PyErr_NoMemory();
835 obj = PyObject_INIT(obj, &PyUnicode_Type);
836 if (obj == NULL)
837 return NULL;
838
839 unicode = (PyCompactUnicodeObject *)obj;
840 if (is_ascii)
841 data = ((PyASCIIObject*)obj) + 1;
842 else
843 data = unicode + 1;
844 _PyUnicode_LENGTH(unicode) = size;
845 _PyUnicode_HASH(unicode) = -1;
846 _PyUnicode_STATE(unicode).interned = 0;
847 _PyUnicode_STATE(unicode).kind = kind_state;
848 _PyUnicode_STATE(unicode).compact = 1;
849 _PyUnicode_STATE(unicode).ready = 1;
850 _PyUnicode_STATE(unicode).ascii = is_ascii;
851 if (is_ascii) {
852 ((char*)data)[size] = 0;
853 _PyUnicode_WSTR(unicode) = NULL;
854 }
855 else if (kind_state == PyUnicode_1BYTE_KIND) {
856 ((char*)data)[size] = 0;
857 _PyUnicode_WSTR(unicode) = NULL;
858 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200860 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200861 }
862 else {
863 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200864 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200865 if (kind_state == PyUnicode_2BYTE_KIND)
866 ((Py_UCS2*)data)[size] = 0;
867 else /* kind_state == PyUnicode_4BYTE_KIND */
868 ((Py_UCS4*)data)[size] = 0;
869 if (is_sharing) {
870 _PyUnicode_WSTR_LENGTH(unicode) = size;
871 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
872 }
873 else {
874 _PyUnicode_WSTR_LENGTH(unicode) = 0;
875 _PyUnicode_WSTR(unicode) = NULL;
876 }
877 }
878 return obj;
879}
880
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing into the
   4-byte data buffer of `unicode`.  A high surrogate immediately followed
   by a low surrogate is combined into one code point; every other code unit
   (lone surrogates included) is copied unchanged.  The remaining
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (is_pair) {
            *dst++ = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
919
Victor Stinnercd9950f2011-10-02 00:34:53 +0200920static int
921_PyUnicode_Dirty(PyObject *unicode)
922{
Victor Stinner910337b2011-10-03 03:20:16 +0200923 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200924 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200925 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200926 "Cannot modify a string having more than 1 reference");
927 return -1;
928 }
929 _PyUnicode_DIRTY(unicode);
930 return 0;
931}
932
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200933Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200934PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
935 PyObject *from, Py_ssize_t from_start,
936 Py_ssize_t how_many)
937{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200938 unsigned int from_kind, to_kind;
939 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200940
Victor Stinnerb1536152011-09-30 02:26:10 +0200941 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
942 PyErr_BadInternalCall();
943 return -1;
944 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200945
946 if (PyUnicode_READY(from))
947 return -1;
948 if (PyUnicode_READY(to))
949 return -1;
950
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200951 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200952 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
Victor Stinner01698042011-10-04 00:04:26 +0200953 PyErr_Format(PyExc_SystemError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200954 "Cannot write %zi characters at %zi "
955 "in a string of %zi characters",
956 how_many, to_start, PyUnicode_GET_LENGTH(to));
957 return -1;
958 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200959 if (how_many == 0)
960 return 0;
961
Victor Stinnercd9950f2011-10-02 00:34:53 +0200962 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200963 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200965 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200966 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200968 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969
Victor Stinnerf42dc442011-10-02 23:33:16 +0200970 if (from_kind == to_kind
971 /* deny latin1 => ascii */
972 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
973 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200974 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200975 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200976 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200977 + PyUnicode_KIND_SIZE(from_kind, from_start),
978 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200980 else if (from_kind == PyUnicode_1BYTE_KIND
981 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200982 {
983 _PyUnicode_CONVERT_BYTES(
984 Py_UCS1, Py_UCS2,
985 PyUnicode_1BYTE_DATA(from) + from_start,
986 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
987 PyUnicode_2BYTE_DATA(to) + to_start
988 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200989 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200990 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200991 && to_kind == PyUnicode_4BYTE_KIND)
992 {
993 _PyUnicode_CONVERT_BYTES(
994 Py_UCS1, Py_UCS4,
995 PyUnicode_1BYTE_DATA(from) + from_start,
996 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
997 PyUnicode_4BYTE_DATA(to) + to_start
998 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200999 }
1000 else if (from_kind == PyUnicode_2BYTE_KIND
1001 && to_kind == PyUnicode_4BYTE_KIND)
1002 {
1003 _PyUnicode_CONVERT_BYTES(
1004 Py_UCS2, Py_UCS4,
1005 PyUnicode_2BYTE_DATA(from) + from_start,
1006 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1007 PyUnicode_4BYTE_DATA(to) + to_start
1008 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001009 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001010 else {
1011 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +02001012
1013 /* check if max_char(from substring) <= max_char(to) */
1014 if (from_kind > to_kind
1015 /* latin1 => ascii */
Victor Stinnera3b334d2011-10-03 13:53:37 +02001016 || (PyUnicode_IS_ASCII(to)
Victor Stinnerf42dc442011-10-02 23:33:16 +02001017 && to_kind == PyUnicode_1BYTE_KIND
Victor Stinnera3b334d2011-10-03 13:53:37 +02001018 && !PyUnicode_IS_ASCII(from)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001019 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001020 /* slow path to check for character overflow */
1021 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1022 Py_UCS4 ch, maxchar;
1023 Py_ssize_t i;
1024
1025 maxchar = 0;
1026 invalid_kinds = 0;
1027 for (i=0; i < how_many; i++) {
1028 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1029 if (ch > maxchar) {
1030 maxchar = ch;
1031 if (maxchar > to_maxchar) {
1032 invalid_kinds = 1;
1033 break;
1034 }
1035 }
1036 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1037 }
1038 }
1039 else
1040 invalid_kinds = 1;
1041 if (invalid_kinds) {
Victor Stinner01698042011-10-04 00:04:26 +02001042 PyErr_Format(PyExc_SystemError,
Victor Stinnerf42dc442011-10-02 23:33:16 +02001043 "Cannot copy %s characters "
1044 "into a string of %s characters",
1045 unicode_kind_name(from),
1046 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +02001047 return -1;
1048 }
1049 }
1050 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051}
1052
Victor Stinner17222162011-09-28 22:15:37 +02001053/* Find the maximum code point and count the number of surrogate pairs so a
1054 correct string length can be computed before converting a string to UCS4.
1055 This function counts single surrogates as a character and not as a pair.
1056
1057 Return 0 on success, or -1 on error. */
1058static int
1059find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1060 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061{
1062 const wchar_t *iter;
1063
Victor Stinnerc53be962011-10-02 21:33:54 +02001064 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 if (num_surrogates == NULL || maxchar == NULL) {
1066 PyErr_SetString(PyExc_SystemError,
1067 "unexpected NULL arguments to "
1068 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
1069 return -1;
1070 }
1071
1072 *num_surrogates = 0;
1073 *maxchar = 0;
1074
1075 for (iter = begin; iter < end; ) {
1076 if (*iter > *maxchar)
1077 *maxchar = *iter;
1078#if SIZEOF_WCHAR_T == 2
1079 if (*iter >= 0xD800 && *iter <= 0xDBFF
1080 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1081 {
1082 Py_UCS4 surrogate_val;
1083 surrogate_val = (((iter[0] & 0x3FF)<<10)
1084 | (iter[1] & 0x3FF)) + 0x10000;
1085 ++(*num_surrogates);
1086 if (surrogate_val > *maxchar)
1087 *maxchar = surrogate_val;
1088 iter += 2;
1089 }
1090 else
1091 iter++;
1092#else
1093 iter++;
1094#endif
1095 }
1096 return 0;
1097}
1098
1099#ifdef Py_DEBUG
1100int unicode_ready_calls = 0;
1101#endif
1102
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001103static int
1104unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001106 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 wchar_t *end;
1108 Py_UCS4 maxchar = 0;
1109 Py_ssize_t num_surrogates;
1110#if SIZEOF_WCHAR_T == 2
1111 Py_ssize_t length_wo_surrogates;
1112#endif
1113
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001114 assert(p_obj != NULL);
1115 unicode = (PyUnicodeObject *)*p_obj;
1116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001118 strings were created using _PyObject_New() and where no canonical
1119 representation (the str field) has been set yet aka strings
1120 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001121 assert(_PyUnicode_CHECK(unicode));
1122 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001124 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001125 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001126 /* Actually, it should neither be interned nor be anything else: */
1127 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128
1129#ifdef Py_DEBUG
1130 ++unicode_ready_calls;
1131#endif
1132
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001133#ifdef Py_DEBUG
1134 assert(!replace || Py_REFCNT(unicode) == 1);
1135#else
1136 if (replace && Py_REFCNT(unicode) != 1)
1137 replace = 0;
1138#endif
1139 if (replace) {
1140 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1141 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1142 /* Optimization for empty strings */
1143 if (len == 0) {
1144 Py_INCREF(unicode_empty);
1145 Py_DECREF(*p_obj);
1146 *p_obj = unicode_empty;
1147 return 0;
1148 }
1149 if (len == 1 && wstr[0] < 256) {
1150 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1151 if (latin1_char == NULL)
1152 return -1;
1153 Py_DECREF(*p_obj);
1154 *p_obj = latin1_char;
1155 return 0;
1156 }
1157 }
1158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001160 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001161 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001163
1164 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001165 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1166 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167 PyErr_NoMemory();
1168 return -1;
1169 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001170 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171 _PyUnicode_WSTR(unicode), end,
1172 PyUnicode_1BYTE_DATA(unicode));
1173 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1174 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1175 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1176 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001177 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001178 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001179 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001180 }
1181 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001182 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001183 _PyUnicode_UTF8(unicode) = NULL;
1184 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001185 }
1186 PyObject_FREE(_PyUnicode_WSTR(unicode));
1187 _PyUnicode_WSTR(unicode) = NULL;
1188 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1189 }
1190 /* In this case we might have to convert down from 4-byte native
1191 wchar_t to 2-byte unicode. */
1192 else if (maxchar < 65536) {
1193 assert(num_surrogates == 0 &&
1194 "FindMaxCharAndNumSurrogatePairs() messed up");
1195
Victor Stinner506f5922011-09-28 22:34:18 +02001196#if SIZEOF_WCHAR_T == 2
1197 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001198 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001199 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1200 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1201 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001202 _PyUnicode_UTF8(unicode) = NULL;
1203 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001204#else
1205 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001206 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001207 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001208 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001209 PyErr_NoMemory();
1210 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211 }
Victor Stinner506f5922011-09-28 22:34:18 +02001212 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1213 _PyUnicode_WSTR(unicode), end,
1214 PyUnicode_2BYTE_DATA(unicode));
1215 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1216 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1217 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001218 _PyUnicode_UTF8(unicode) = NULL;
1219 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001220 PyObject_FREE(_PyUnicode_WSTR(unicode));
1221 _PyUnicode_WSTR(unicode) = NULL;
1222 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1223#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 }
1225 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1226 else {
1227#if SIZEOF_WCHAR_T == 2
1228 /* in case the native representation is 2-bytes, we need to allocate a
1229 new normalized 4-byte version. */
1230 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001231 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1232 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233 PyErr_NoMemory();
1234 return -1;
1235 }
1236 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1237 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001238 _PyUnicode_UTF8(unicode) = NULL;
1239 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001240 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1241 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001242 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 PyObject_FREE(_PyUnicode_WSTR(unicode));
1244 _PyUnicode_WSTR(unicode) = NULL;
1245 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1246#else
1247 assert(num_surrogates == 0);
1248
Victor Stinnerc3c74152011-10-02 20:39:55 +02001249 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001251 _PyUnicode_UTF8(unicode) = NULL;
1252 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001253 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1254#endif
1255 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1256 }
1257 _PyUnicode_STATE(unicode).ready = 1;
1258 return 0;
1259}
1260
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001261int
1262_PyUnicode_ReadyReplace(PyObject **op)
1263{
1264 return unicode_ready(op, 1);
1265}
1266
1267int
1268_PyUnicode_Ready(PyObject *op)
1269{
1270 return unicode_ready(&op, 0);
1271}
1272
Alexander Belopolsky40018472011-02-26 01:02:56 +00001273static void
1274unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001275{
Walter Dörwald16807132007-05-25 13:52:07 +00001276 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001277 case SSTATE_NOT_INTERNED:
1278 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001279
Benjamin Peterson29060642009-01-31 22:14:21 +00001280 case SSTATE_INTERNED_MORTAL:
1281 /* revive dead object temporarily for DelItem */
1282 Py_REFCNT(unicode) = 3;
1283 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1284 Py_FatalError(
1285 "deletion of interned string failed");
1286 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001287
Benjamin Peterson29060642009-01-31 22:14:21 +00001288 case SSTATE_INTERNED_IMMORTAL:
1289 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001290
Benjamin Peterson29060642009-01-31 22:14:21 +00001291 default:
1292 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001293 }
1294
Victor Stinner03490912011-10-03 23:45:12 +02001295 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001297 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001298 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001299
1300 if (PyUnicode_IS_COMPACT(unicode)) {
1301 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001302 }
1303 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001304 if (_PyUnicode_DATA_ANY(unicode))
1305 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001306 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307 }
1308}
1309
Alexander Belopolsky40018472011-02-26 01:02:56 +00001310static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001311unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001312{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001313 if (Py_REFCNT(unicode) != 1)
1314 return 0;
1315 if (PyUnicode_CHECK_INTERNED(unicode))
1316 return 0;
Benjamin Peterson7f3140e2011-10-03 19:37:29 -04001317 assert(unicode != unicode_empty);
Victor Stinner77bb47b2011-10-03 20:06:05 +02001318#ifdef Py_DEBUG
1319 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1320 && PyUnicode_GET_LENGTH(unicode) == 1)
1321 {
1322 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001323 if (ch < 256 && unicode_latin1[ch] == unicode)
1324 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001325 }
Victor Stinner77bb47b2011-10-03 20:06:05 +02001326#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001327 return 1;
1328}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001329
Victor Stinnerfe226c02011-10-03 03:52:20 +02001330static int
1331unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1332{
1333 PyObject *unicode;
1334 Py_ssize_t old_length;
1335
1336 assert(p_unicode != NULL);
1337 unicode = *p_unicode;
1338
1339 assert(unicode != NULL);
1340 assert(PyUnicode_Check(unicode));
1341 assert(0 <= length);
1342
Victor Stinner910337b2011-10-03 03:20:16 +02001343 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001344 old_length = PyUnicode_WSTR_LENGTH(unicode);
1345 else
1346 old_length = PyUnicode_GET_LENGTH(unicode);
1347 if (old_length == length)
1348 return 0;
1349
Victor Stinnerfe226c02011-10-03 03:52:20 +02001350 if (!unicode_resizable(unicode)) {
1351 PyObject *copy = resize_copy(unicode, length);
1352 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001353 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001354 Py_DECREF(*p_unicode);
1355 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001356 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001357 }
1358
Victor Stinnerfe226c02011-10-03 03:52:20 +02001359 if (PyUnicode_IS_COMPACT(unicode)) {
1360 *p_unicode = resize_compact(unicode, length);
1361 if (*p_unicode == NULL)
1362 return -1;
Benjamin Petersonccc51c12011-10-03 19:34:12 -04001363 _PyUnicode_CheckConsistency(*p_unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001364 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001365 }
1366 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001367}
1368
Alexander Belopolsky40018472011-02-26 01:02:56 +00001369int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001370PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001371{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001372 PyObject *unicode;
1373 if (p_unicode == NULL) {
1374 PyErr_BadInternalCall();
1375 return -1;
1376 }
1377 unicode = *p_unicode;
1378 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1379 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1380 {
1381 PyErr_BadInternalCall();
1382 return -1;
1383 }
1384 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001385}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387static PyObject*
1388get_latin1_char(unsigned char ch)
1389{
Victor Stinnera464fc12011-10-02 20:39:30 +02001390 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001392 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 if (!unicode)
1394 return NULL;
1395 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1396 unicode_latin1[ch] = unicode;
1397 }
1398 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001399 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400}
1401
Alexander Belopolsky40018472011-02-26 01:02:56 +00001402PyObject *
1403PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001404{
1405 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 Py_UCS4 maxchar = 0;
1407 Py_ssize_t num_surrogates;
1408
1409 if (u == NULL)
1410 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001411
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001412 /* If the Unicode data is known at construction time, we can apply
1413 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 /* Optimization for empty strings */
1416 if (size == 0 && unicode_empty != NULL) {
1417 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001418 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001419 }
Tim Petersced69f82003-09-16 20:30:58 +00001420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 /* Single character Unicode objects in the Latin-1 range are
1422 shared when using this constructor */
1423 if (size == 1 && *u < 256)
1424 return get_latin1_char((unsigned char)*u);
1425
1426 /* If not empty and not single character, copy the Unicode data
1427 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001428 if (find_maxchar_surrogates(u, u + size,
1429 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 return NULL;
1431
1432 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1433 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001434 if (!unicode)
1435 return NULL;
1436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 switch (PyUnicode_KIND(unicode)) {
1438 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001439 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1441 break;
1442 case PyUnicode_2BYTE_KIND:
1443#if Py_UNICODE_SIZE == 2
1444 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1445#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001446 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1448#endif
1449 break;
1450 case PyUnicode_4BYTE_KIND:
1451#if SIZEOF_WCHAR_T == 2
1452 /* This is the only case which has to process surrogates, thus
1453 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001454 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001455#else
1456 assert(num_surrogates == 0);
1457 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1458#endif
1459 break;
1460 default:
1461 assert(0 && "Impossible state");
1462 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001463
1464 return (PyObject *)unicode;
1465}
1466
Alexander Belopolsky40018472011-02-26 01:02:56 +00001467PyObject *
1468PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001469{
1470 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001471
Benjamin Peterson14339b62009-01-31 16:36:08 +00001472 if (size < 0) {
1473 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001474 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001475 return NULL;
1476 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001477
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001478 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001479 some optimizations which share commonly used objects.
1480 Also, this means the input must be UTF-8, so fall back to the
1481 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001482 if (u != NULL) {
1483
Benjamin Peterson29060642009-01-31 22:14:21 +00001484 /* Optimization for empty strings */
1485 if (size == 0 && unicode_empty != NULL) {
1486 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001487 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001488 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001489
1490 /* Single characters are shared when using this constructor.
1491 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 if (size == 1 && Py_CHARMASK(*u) < 128)
1493 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001494
1495 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001496 }
1497
Walter Dörwald55507312007-05-18 13:12:10 +00001498 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001499 if (!unicode)
1500 return NULL;
1501
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001502 return (PyObject *)unicode;
1503}
1504
Alexander Belopolsky40018472011-02-26 01:02:56 +00001505PyObject *
1506PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001507{
1508 size_t size = strlen(u);
1509 if (size > PY_SSIZE_T_MAX) {
1510 PyErr_SetString(PyExc_OverflowError, "input too long");
1511 return NULL;
1512 }
1513
1514 return PyUnicode_FromStringAndSize(u, size);
1515}
1516
Victor Stinnere57b1c02011-09-28 22:20:48 +02001517static PyObject*
Victor Stinner702c7342011-10-05 13:50:52 +02001518unicode_fromascii(const unsigned char* u, Py_ssize_t size)
1519{
1520 PyObject *res = PyUnicode_New(size, 127);
1521 if (!res)
1522 return NULL;
1523 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1524 return res;
1525}
1526
1527static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001528_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001529{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001530 PyObject *res;
1531 unsigned char max = 127;
1532 Py_ssize_t i;
1533 for (i = 0; i < size; i++) {
1534 if (u[i] & 0x80) {
1535 max = 255;
1536 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001537 }
1538 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001539 res = PyUnicode_New(size, max);
1540 if (!res)
1541 return NULL;
1542 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1543 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001544}
1545
Victor Stinnere57b1c02011-09-28 22:20:48 +02001546static PyObject*
1547_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001548{
1549 PyObject *res;
1550 Py_UCS2 max = 0;
1551 Py_ssize_t i;
1552 for (i = 0; i < size; i++)
1553 if (u[i] > max)
1554 max = u[i];
1555 res = PyUnicode_New(size, max);
1556 if (!res)
1557 return NULL;
1558 if (max >= 256)
1559 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1560 else
1561 for (i = 0; i < size; i++)
1562 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1563 return res;
1564}
1565
Victor Stinnere57b1c02011-09-28 22:20:48 +02001566static PyObject*
1567_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001568{
1569 PyObject *res;
1570 Py_UCS4 max = 0;
1571 Py_ssize_t i;
1572 for (i = 0; i < size; i++)
1573 if (u[i] > max)
1574 max = u[i];
1575 res = PyUnicode_New(size, max);
1576 if (!res)
1577 return NULL;
1578 if (max >= 0x10000)
1579 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1580 else {
1581 int kind = PyUnicode_KIND(res);
1582 void *data = PyUnicode_DATA(res);
1583 for (i = 0; i < size; i++)
1584 PyUnicode_WRITE(kind, data, i, u[i]);
1585 }
1586 return res;
1587}
1588
1589PyObject*
1590PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1591{
1592 switch(kind) {
1593 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001594 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001595 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001596 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001598 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599 }
Victor Stinner01698042011-10-04 00:04:26 +02001600 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001601 return NULL;
1602}
1603
Victor Stinner034f6cf2011-09-30 02:26:44 +02001604PyObject*
1605PyUnicode_Copy(PyObject *unicode)
1606{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001607 Py_ssize_t size;
1608 PyObject *copy;
1609 void *data;
1610
Victor Stinner034f6cf2011-09-30 02:26:44 +02001611 if (!PyUnicode_Check(unicode)) {
1612 PyErr_BadInternalCall();
1613 return NULL;
1614 }
1615 if (PyUnicode_READY(unicode))
1616 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001617
1618 size = PyUnicode_GET_LENGTH(unicode);
1619 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1620 if (!copy)
1621 return NULL;
1622 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1623
1624 data = PyUnicode_DATA(unicode);
1625 switch (PyUnicode_KIND(unicode))
1626 {
1627 case PyUnicode_1BYTE_KIND:
1628 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1629 break;
1630 case PyUnicode_2BYTE_KIND:
1631 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1632 break;
1633 case PyUnicode_4BYTE_KIND:
1634 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1635 break;
1636 default:
1637 assert(0);
1638 break;
1639 }
1640 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001641}
1642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001643
Victor Stinnerbc603d12011-10-02 01:00:40 +02001644/* Widen Unicode objects to larger buffers. Don't write terminating null
1645 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646
1647void*
1648_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1649{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001650 Py_ssize_t len;
1651 void *result;
1652 unsigned int skind;
1653
1654 if (PyUnicode_READY(s))
1655 return NULL;
1656
1657 len = PyUnicode_GET_LENGTH(s);
1658 skind = PyUnicode_KIND(s);
1659 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001660 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661 return NULL;
1662 }
1663 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001664 case PyUnicode_2BYTE_KIND:
1665 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1666 if (!result)
1667 return PyErr_NoMemory();
1668 assert(skind == PyUnicode_1BYTE_KIND);
1669 _PyUnicode_CONVERT_BYTES(
1670 Py_UCS1, Py_UCS2,
1671 PyUnicode_1BYTE_DATA(s),
1672 PyUnicode_1BYTE_DATA(s) + len,
1673 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001674 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001675 case PyUnicode_4BYTE_KIND:
1676 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1677 if (!result)
1678 return PyErr_NoMemory();
1679 if (skind == PyUnicode_2BYTE_KIND) {
1680 _PyUnicode_CONVERT_BYTES(
1681 Py_UCS2, Py_UCS4,
1682 PyUnicode_2BYTE_DATA(s),
1683 PyUnicode_2BYTE_DATA(s) + len,
1684 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001686 else {
1687 assert(skind == PyUnicode_1BYTE_KIND);
1688 _PyUnicode_CONVERT_BYTES(
1689 Py_UCS1, Py_UCS4,
1690 PyUnicode_1BYTE_DATA(s),
1691 PyUnicode_1BYTE_DATA(s) + len,
1692 result);
1693 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001695 default:
1696 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697 }
Victor Stinner01698042011-10-04 00:04:26 +02001698 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699 return NULL;
1700}
1701
1702static Py_UCS4*
1703as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1704 int copy_null)
1705{
1706 int kind;
1707 void *data;
1708 Py_ssize_t len, targetlen;
1709 if (PyUnicode_READY(string) == -1)
1710 return NULL;
1711 kind = PyUnicode_KIND(string);
1712 data = PyUnicode_DATA(string);
1713 len = PyUnicode_GET_LENGTH(string);
1714 targetlen = len;
1715 if (copy_null)
1716 targetlen++;
1717 if (!target) {
1718 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1719 PyErr_NoMemory();
1720 return NULL;
1721 }
1722 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1723 if (!target) {
1724 PyErr_NoMemory();
1725 return NULL;
1726 }
1727 }
1728 else {
1729 if (targetsize < targetlen) {
1730 PyErr_Format(PyExc_SystemError,
1731 "string is longer than the buffer");
1732 if (copy_null && 0 < targetsize)
1733 target[0] = 0;
1734 return NULL;
1735 }
1736 }
1737 if (kind != PyUnicode_4BYTE_KIND) {
1738 Py_ssize_t i;
1739 for (i = 0; i < len; i++)
1740 target[i] = PyUnicode_READ(kind, data, i);
1741 }
1742 else
1743 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1744 if (copy_null)
1745 target[len] = 0;
1746 return target;
1747}
1748
1749Py_UCS4*
1750PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1751 int copy_null)
1752{
1753 if (target == NULL || targetsize < 1) {
1754 PyErr_BadInternalCall();
1755 return NULL;
1756 }
1757 return as_ucs4(string, target, targetsize, copy_null);
1758}
1759
1760Py_UCS4*
1761PyUnicode_AsUCS4Copy(PyObject *string)
1762{
1763 return as_ucs4(string, NULL, 0, 1);
1764}
1765
1766#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001767
Alexander Belopolsky40018472011-02-26 01:02:56 +00001768PyObject *
1769PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001772 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001774 PyErr_BadInternalCall();
1775 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001776 }
1777
Martin v. Löwis790465f2008-04-05 20:41:37 +00001778 if (size == -1) {
1779 size = wcslen(w);
1780 }
1781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001782 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001783}
1784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001786
Walter Dörwald346737f2007-05-31 10:44:43 +00001787static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001788makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1789 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001790{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001791 *fmt++ = '%';
1792 if (width) {
1793 if (zeropad)
1794 *fmt++ = '0';
1795 fmt += sprintf(fmt, "%d", width);
1796 }
1797 if (precision)
1798 fmt += sprintf(fmt, ".%d", precision);
1799 if (longflag)
1800 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001801 else if (longlongflag) {
1802 /* longlongflag should only ever be nonzero on machines with
1803 HAVE_LONG_LONG defined */
1804#ifdef HAVE_LONG_LONG
1805 char *f = PY_FORMAT_LONG_LONG;
1806 while (*f)
1807 *fmt++ = *f++;
1808#else
1809 /* we shouldn't ever get here */
1810 assert(0);
1811 *fmt++ = 'l';
1812#endif
1813 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001814 else if (size_tflag) {
1815 char *f = PY_FORMAT_SIZE_T;
1816 while (*f)
1817 *fmt++ = *f++;
1818 }
1819 *fmt++ = c;
1820 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001821}
1822
Victor Stinner96865452011-03-01 23:44:09 +00001823/* helper for PyUnicode_FromFormatV() */
1824
1825static const char*
1826parse_format_flags(const char *f,
1827 int *p_width, int *p_precision,
1828 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
1829{
1830 int width, precision, longflag, longlongflag, size_tflag;
1831
1832 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
1833 f++;
1834 width = 0;
1835 while (Py_ISDIGIT((unsigned)*f))
1836 width = (width*10) + *f++ - '0';
1837 precision = 0;
1838 if (*f == '.') {
1839 f++;
1840 while (Py_ISDIGIT((unsigned)*f))
1841 precision = (precision*10) + *f++ - '0';
1842 if (*f == '%') {
1843 /* "%.3%s" => f points to "3" */
1844 f--;
1845 }
1846 }
1847 if (*f == '\0') {
1848 /* bogus format "%.1" => go backward, f points to "1" */
1849 f--;
1850 }
1851 if (p_width != NULL)
1852 *p_width = width;
1853 if (p_precision != NULL)
1854 *p_precision = precision;
1855
1856 /* Handle %ld, %lu, %lld and %llu. */
1857 longflag = 0;
1858 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00001859 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00001860
1861 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00001862 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00001863 longflag = 1;
1864 ++f;
1865 }
1866#ifdef HAVE_LONG_LONG
1867 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00001868 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001869 longlongflag = 1;
1870 f += 2;
1871 }
1872#endif
1873 }
1874 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00001875 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001876 size_tflag = 1;
1877 ++f;
1878 }
1879 if (p_longflag != NULL)
1880 *p_longflag = longflag;
1881 if (p_longlongflag != NULL)
1882 *p_longlongflag = longlongflag;
1883 if (p_size_tflag != NULL)
1884 *p_size_tflag = size_tflag;
1885 return f;
1886}
1887
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001888/* maximum number of characters required for output of %ld. 21 characters
1889 allows for 64-bit integers (in decimal) and an optional sign. */
1890#define MAX_LONG_CHARS 21
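/* Maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, plus 1 for
   the sign.  53/22 is an upper bound for log10(256), and
   (SIZEOF_LONG_LONG*53-1)/22 + 1 is that ceiling in integer arithmetic,
   which together with the sign gives the leading 2 below. */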
1894#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1895
Walter Dörwaldd2034312007-05-18 16:29:38 +00001896PyObject *
1897PyUnicode_FromFormatV(const char *format, va_list vargs)
1898{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001899 va_list count;
1900 Py_ssize_t callcount = 0;
1901 PyObject **callresults = NULL;
1902 PyObject **callresult = NULL;
1903 Py_ssize_t n = 0;
1904 int width = 0;
1905 int precision = 0;
1906 int zeropad;
1907 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001908 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001909 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001910 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001911 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1912 Py_UCS4 argmaxchar;
1913 Py_ssize_t numbersize = 0;
1914 char *numberresults = NULL;
1915 char *numberresult = NULL;
1916 Py_ssize_t i;
1917 int kind;
1918 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001919
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001920 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001921 /* step 1: count the number of %S/%R/%A/%s format specifications
1922 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1923 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001924 * result in an array)
1925 * also esimate a upper bound for all the number formats in the string,
1926 * numbers will be formated in step 3 and be keept in a '\0'-separated
1927 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001928 for (f = format; *f; f++) {
1929 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001930 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001931 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1932 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1933 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1934 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001936 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001937#ifdef HAVE_LONG_LONG
1938 if (longlongflag) {
1939 if (width < MAX_LONG_LONG_CHARS)
1940 width = MAX_LONG_LONG_CHARS;
1941 }
1942 else
1943#endif
1944 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1945 including sign. Decimal takes the most space. This
1946 isn't enough for octal. If a width is specified we
1947 need more (which we allocate later). */
1948 if (width < MAX_LONG_CHARS)
1949 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950
1951 /* account for the size + '\0' to separate numbers
1952 inside of the numberresults buffer */
1953 numbersize += (width + 1);
1954 }
1955 }
1956 else if ((unsigned char)*f > 127) {
1957 PyErr_Format(PyExc_ValueError,
1958 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1959 "string, got a non-ASCII byte: 0x%02x",
1960 (unsigned char)*f);
1961 return NULL;
1962 }
1963 }
1964 /* step 2: allocate memory for the results of
1965 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1966 if (callcount) {
1967 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1968 if (!callresults) {
1969 PyErr_NoMemory();
1970 return NULL;
1971 }
1972 callresult = callresults;
1973 }
1974 /* step 2.5: allocate memory for the results of formating numbers */
1975 if (numbersize) {
1976 numberresults = PyObject_Malloc(numbersize);
1977 if (!numberresults) {
1978 PyErr_NoMemory();
1979 goto fail;
1980 }
1981 numberresult = numberresults;
1982 }
1983
1984 /* step 3: format numbers and figure out how large a buffer we need */
1985 for (f = format; *f; f++) {
1986 if (*f == '%') {
1987 const char* p;
1988 int longflag;
1989 int longlongflag;
1990 int size_tflag;
1991 int numprinted;
1992
1993 p = f;
1994 zeropad = (f[1] == '0');
1995 f = parse_format_flags(f, &width, &precision,
1996 &longflag, &longlongflag, &size_tflag);
1997 switch (*f) {
1998 case 'c':
1999 {
2000 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002001 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 n++;
2003 break;
2004 }
2005 case '%':
2006 n++;
2007 break;
2008 case 'i':
2009 case 'd':
2010 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2011 width, precision, *f);
2012 if (longflag)
2013 numprinted = sprintf(numberresult, fmt,
2014 va_arg(count, long));
2015#ifdef HAVE_LONG_LONG
2016 else if (longlongflag)
2017 numprinted = sprintf(numberresult, fmt,
2018 va_arg(count, PY_LONG_LONG));
2019#endif
2020 else if (size_tflag)
2021 numprinted = sprintf(numberresult, fmt,
2022 va_arg(count, Py_ssize_t));
2023 else
2024 numprinted = sprintf(numberresult, fmt,
2025 va_arg(count, int));
2026 n += numprinted;
2027 /* advance by +1 to skip over the '\0' */
2028 numberresult += (numprinted + 1);
2029 assert(*(numberresult - 1) == '\0');
2030 assert(*(numberresult - 2) != '\0');
2031 assert(numprinted >= 0);
2032 assert(numberresult <= numberresults + numbersize);
2033 break;
2034 case 'u':
2035 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2036 width, precision, 'u');
2037 if (longflag)
2038 numprinted = sprintf(numberresult, fmt,
2039 va_arg(count, unsigned long));
2040#ifdef HAVE_LONG_LONG
2041 else if (longlongflag)
2042 numprinted = sprintf(numberresult, fmt,
2043 va_arg(count, unsigned PY_LONG_LONG));
2044#endif
2045 else if (size_tflag)
2046 numprinted = sprintf(numberresult, fmt,
2047 va_arg(count, size_t));
2048 else
2049 numprinted = sprintf(numberresult, fmt,
2050 va_arg(count, unsigned int));
2051 n += numprinted;
2052 numberresult += (numprinted + 1);
2053 assert(*(numberresult - 1) == '\0');
2054 assert(*(numberresult - 2) != '\0');
2055 assert(numprinted >= 0);
2056 assert(numberresult <= numberresults + numbersize);
2057 break;
2058 case 'x':
2059 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2060 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2061 n += numprinted;
2062 numberresult += (numprinted + 1);
2063 assert(*(numberresult - 1) == '\0');
2064 assert(*(numberresult - 2) != '\0');
2065 assert(numprinted >= 0);
2066 assert(numberresult <= numberresults + numbersize);
2067 break;
2068 case 'p':
2069 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2070 /* %p is ill-defined: ensure leading 0x. */
2071 if (numberresult[1] == 'X')
2072 numberresult[1] = 'x';
2073 else if (numberresult[1] != 'x') {
2074 memmove(numberresult + 2, numberresult,
2075 strlen(numberresult) + 1);
2076 numberresult[0] = '0';
2077 numberresult[1] = 'x';
2078 numprinted += 2;
2079 }
2080 n += numprinted;
2081 numberresult += (numprinted + 1);
2082 assert(*(numberresult - 1) == '\0');
2083 assert(*(numberresult - 2) != '\0');
2084 assert(numprinted >= 0);
2085 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002086 break;
2087 case 's':
2088 {
2089 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002090 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002091 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2092 if (!str)
2093 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002094 /* since PyUnicode_DecodeUTF8 returns already flexible
2095 unicode objects, there is no need to call ready on them */
2096 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002097 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002098 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002099 /* Remember the str and switch to the next slot */
2100 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002101 break;
2102 }
2103 case 'U':
2104 {
2105 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002106 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002107 if (PyUnicode_READY(obj) == -1)
2108 goto fail;
2109 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002110 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002111 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002112 break;
2113 }
2114 case 'V':
2115 {
2116 PyObject *obj = va_arg(count, PyObject *);
2117 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002118 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002119 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002120 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002121 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002122 if (PyUnicode_READY(obj) == -1)
2123 goto fail;
2124 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002125 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002126 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002127 *callresult++ = NULL;
2128 }
2129 else {
2130 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2131 if (!str_obj)
2132 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002133 if (PyUnicode_READY(str_obj)) {
2134 Py_DECREF(str_obj);
2135 goto fail;
2136 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002137 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002138 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002139 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002140 *callresult++ = str_obj;
2141 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002142 break;
2143 }
2144 case 'S':
2145 {
2146 PyObject *obj = va_arg(count, PyObject *);
2147 PyObject *str;
2148 assert(obj);
2149 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002150 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002151 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002152 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002153 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002154 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002155 /* Remember the str and switch to the next slot */
2156 *callresult++ = str;
2157 break;
2158 }
2159 case 'R':
2160 {
2161 PyObject *obj = va_arg(count, PyObject *);
2162 PyObject *repr;
2163 assert(obj);
2164 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002165 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002166 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002167 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002168 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002170 /* Remember the repr and switch to the next slot */
2171 *callresult++ = repr;
2172 break;
2173 }
2174 case 'A':
2175 {
2176 PyObject *obj = va_arg(count, PyObject *);
2177 PyObject *ascii;
2178 assert(obj);
2179 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002180 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002181 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002183 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002184 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002185 /* Remember the repr and switch to the next slot */
2186 *callresult++ = ascii;
2187 break;
2188 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002189 default:
2190 /* if we stumble upon an unknown
2191 formatting code, copy the rest of
2192 the format string to the output
2193 string. (we cannot just skip the
2194 code, since there's no way to know
2195 what's in the argument list) */
2196 n += strlen(p);
2197 goto expand;
2198 }
2199 } else
2200 n++;
2201 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002202 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002203 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002204 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002205 we don't have to resize the string.
2206 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002207 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002208 if (!string)
2209 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002210 kind = PyUnicode_KIND(string);
2211 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002212 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002213 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002216 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002217 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002218
2219 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2221 /* checking for == because the last argument could be a empty
2222 string, which causes i to point to end, the assert at the end of
2223 the loop */
2224 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002225
Benjamin Peterson14339b62009-01-31 16:36:08 +00002226 switch (*f) {
2227 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002228 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 const int ordinal = va_arg(vargs, int);
2230 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002231 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002232 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002233 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002234 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002235 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002236 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 case 'p':
2238 /* unused, since we already have the result */
2239 if (*f == 'p')
2240 (void) va_arg(vargs, void *);
2241 else
2242 (void) va_arg(vargs, int);
2243 /* extract the result from numberresults and append. */
2244 for (; *numberresult; ++i, ++numberresult)
2245 PyUnicode_WRITE(kind, data, i, *numberresult);
2246 /* skip over the separating '\0' */
2247 assert(*numberresult == '\0');
2248 numberresult++;
2249 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002250 break;
2251 case 's':
2252 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002253 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002255 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002256 size = PyUnicode_GET_LENGTH(*callresult);
2257 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002258 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2259 *callresult, 0,
2260 size) < 0)
2261 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002262 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002263 /* We're done with the unicode()/repr() => forget it */
2264 Py_DECREF(*callresult);
2265 /* switch to next unicode()/repr() result */
2266 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002267 break;
2268 }
2269 case 'U':
2270 {
2271 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 Py_ssize_t size;
2273 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2274 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002275 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2276 obj, 0,
2277 size) < 0)
2278 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002279 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002280 break;
2281 }
2282 case 'V':
2283 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002285 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002286 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002287 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002288 size = PyUnicode_GET_LENGTH(obj);
2289 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002290 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2291 obj, 0,
2292 size) < 0)
2293 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002294 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002295 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002296 size = PyUnicode_GET_LENGTH(*callresult);
2297 assert(PyUnicode_KIND(*callresult) <=
2298 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002299 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2300 *callresult,
2301 0, size) < 0)
2302 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002303 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002304 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002305 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002306 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002307 break;
2308 }
2309 case 'S':
2310 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002311 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002312 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002313 /* unused, since we already have the result */
2314 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002315 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002316 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2317 *callresult, 0,
2318 PyUnicode_GET_LENGTH(*callresult)) < 0)
2319 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002320 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002321 /* We're done with the unicode()/repr() => forget it */
2322 Py_DECREF(*callresult);
2323 /* switch to next unicode()/repr() result */
2324 ++callresult;
2325 break;
2326 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002327 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002328 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002329 break;
2330 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002331 for (; *p; ++p, ++i)
2332 PyUnicode_WRITE(kind, data, i, *p);
2333 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002334 goto end;
2335 }
Victor Stinner1205f272010-09-11 00:54:47 +00002336 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002337 else {
2338 assert(i < PyUnicode_GET_LENGTH(string));
2339 PyUnicode_WRITE(kind, data, i++, *f);
2340 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002342 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002343
Benjamin Peterson29060642009-01-31 22:14:21 +00002344 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002345 if (callresults)
2346 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002347 if (numberresults)
2348 PyObject_Free(numberresults);
2349 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002350 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002351 if (callresults) {
2352 PyObject **callresult2 = callresults;
2353 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002354 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002355 ++callresult2;
2356 }
2357 PyObject_Free(callresults);
2358 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002359 if (numberresults)
2360 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002361 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002362}
2363
Walter Dörwaldd2034312007-05-18 16:29:38 +00002364PyObject *
2365PyUnicode_FromFormat(const char *format, ...)
2366{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002367 PyObject* ret;
2368 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002369
2370#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002371 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002372#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002373 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002374#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002375 ret = PyUnicode_FromFormatV(format, vargs);
2376 va_end(vargs);
2377 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002378}
2379
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002380#ifdef HAVE_WCHAR_H
2381
Victor Stinner5593d8a2010-10-02 11:11:27 +00002382/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2383 convert a Unicode object to a wide character string.
2384
Victor Stinnerd88d9832011-09-06 02:00:05 +02002385 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002386 character) required to convert the unicode object. Ignore size argument.
2387
Victor Stinnerd88d9832011-09-06 02:00:05 +02002388 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002389 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002390 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002391static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002392unicode_aswidechar(PyUnicodeObject *unicode,
2393 wchar_t *w,
2394 Py_ssize_t size)
2395{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002396 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002397 const wchar_t *wstr;
2398
2399 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2400 if (wstr == NULL)
2401 return -1;
2402
Victor Stinner5593d8a2010-10-02 11:11:27 +00002403 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002404 if (size > res)
2405 size = res + 1;
2406 else
2407 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002409 return res;
2410 }
2411 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002413}
2414
2415Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002416PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002417 wchar_t *w,
2418 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002419{
2420 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002421 PyErr_BadInternalCall();
2422 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002423 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002424 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002425}
2426
Victor Stinner137c34c2010-09-29 10:25:54 +00002427wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002428PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002429 Py_ssize_t *size)
2430{
2431 wchar_t* buffer;
2432 Py_ssize_t buflen;
2433
2434 if (unicode == NULL) {
2435 PyErr_BadInternalCall();
2436 return NULL;
2437 }
2438
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002439 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002440 if (buflen == -1)
2441 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002442 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002443 PyErr_NoMemory();
2444 return NULL;
2445 }
2446
Victor Stinner137c34c2010-09-29 10:25:54 +00002447 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2448 if (buffer == NULL) {
2449 PyErr_NoMemory();
2450 return NULL;
2451 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002452 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002453 if (buflen == -1)
2454 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002455 if (size != NULL)
2456 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002457 return buffer;
2458}
2459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002460#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002461
Alexander Belopolsky40018472011-02-26 01:02:56 +00002462PyObject *
2463PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002464{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002465 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002466 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002467 PyErr_SetString(PyExc_ValueError,
2468 "chr() arg not in range(0x110000)");
2469 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002470 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002472 if (ordinal < 256)
2473 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002475 v = PyUnicode_New(1, ordinal);
2476 if (v == NULL)
2477 return NULL;
2478 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2479 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002480}
2481
Alexander Belopolsky40018472011-02-26 01:02:56 +00002482PyObject *
2483PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002484{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002485 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002486 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002487 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002488 if (PyUnicode_READY(obj))
2489 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002490 Py_INCREF(obj);
2491 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002492 }
2493 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002494 /* For a Unicode subtype that's not a Unicode object,
2495 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002496 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002497 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002498 PyErr_Format(PyExc_TypeError,
2499 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002500 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002501 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002502}
2503
Alexander Belopolsky40018472011-02-26 01:02:56 +00002504PyObject *
2505PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002506 const char *encoding,
2507 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002508{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002509 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002510 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002511
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002513 PyErr_BadInternalCall();
2514 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002515 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002516
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002517 /* Decoding bytes objects is the most common case and should be fast */
2518 if (PyBytes_Check(obj)) {
2519 if (PyBytes_GET_SIZE(obj) == 0) {
2520 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002521 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002522 }
2523 else {
2524 v = PyUnicode_Decode(
2525 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2526 encoding, errors);
2527 }
2528 return v;
2529 }
2530
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002531 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002532 PyErr_SetString(PyExc_TypeError,
2533 "decoding str is not supported");
2534 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002535 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002536
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002537 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2538 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2539 PyErr_Format(PyExc_TypeError,
2540 "coercing to str: need bytes, bytearray "
2541 "or buffer-like object, %.80s found",
2542 Py_TYPE(obj)->tp_name);
2543 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002544 }
Tim Petersced69f82003-09-16 20:30:58 +00002545
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002546 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002547 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002548 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 }
Tim Petersced69f82003-09-16 20:30:58 +00002550 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002551 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002552
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002553 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002554 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555}
2556
/* Write a normalized copy of an encoding name into lower: upper case
   letters become lower case and '_' becomes '-', so that e.g. "UTF_8" is
   recognized as "utf-8".

   lower_len is the capacity of lower including the terminating null byte.
   Return 1 on success.  Return 0 if encoding is longer than lower_len-1
   characters; in that case lower is left without a terminating null byte. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const size_t capacity = lower_len - 1;   /* keep room for '\0' */
    size_t used = 0;
    const char *src;

    for (src = encoding; *src != '\0'; src++) {
        char ch = *src;

        if (used == capacity)
            return 0;

        if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        else if (ch == '_')
            ch = '-';
        lower[used++] = ch;
    }
    lower[used] = '\0';
    return 1;
}
2589
Alexander Belopolsky40018472011-02-26 01:02:56 +00002590PyObject *
2591PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002592 Py_ssize_t size,
2593 const char *encoding,
2594 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002595{
2596 PyObject *buffer = NULL, *unicode;
2597 Py_buffer info;
2598 char lower[11]; /* Enough for any encoding shortcut */
2599
2600 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002601 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002602
2603 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002604 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002605 if ((strcmp(lower, "utf-8") == 0) ||
2606 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002607 return PyUnicode_DecodeUTF8(s, size, errors);
2608 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002609 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002610 (strcmp(lower, "iso-8859-1") == 0))
2611 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002612#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002613 else if (strcmp(lower, "mbcs") == 0)
2614 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002615#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002616 else if (strcmp(lower, "ascii") == 0)
2617 return PyUnicode_DecodeASCII(s, size, errors);
2618 else if (strcmp(lower, "utf-16") == 0)
2619 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2620 else if (strcmp(lower, "utf-32") == 0)
2621 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2622 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623
2624 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002625 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002626 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002627 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002628 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 if (buffer == NULL)
2630 goto onError;
2631 unicode = PyCodec_Decode(buffer, encoding, errors);
2632 if (unicode == NULL)
2633 goto onError;
2634 if (!PyUnicode_Check(unicode)) {
2635 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002636 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002637 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002638 Py_DECREF(unicode);
2639 goto onError;
2640 }
2641 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002642#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002643 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002644 Py_DECREF(unicode);
2645 return NULL;
2646 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002647#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002648 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002649
Benjamin Peterson29060642009-01-31 22:14:21 +00002650 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651 Py_XDECREF(buffer);
2652 return NULL;
2653}
2654
Alexander Belopolsky40018472011-02-26 01:02:56 +00002655PyObject *
2656PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002657 const char *encoding,
2658 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002659{
2660 PyObject *v;
2661
2662 if (!PyUnicode_Check(unicode)) {
2663 PyErr_BadArgument();
2664 goto onError;
2665 }
2666
2667 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002668 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002669
2670 /* Decode via the codec registry */
2671 v = PyCodec_Decode(unicode, encoding, errors);
2672 if (v == NULL)
2673 goto onError;
2674 return v;
2675
Benjamin Peterson29060642009-01-31 22:14:21 +00002676 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002677 return NULL;
2678}
2679
Alexander Belopolsky40018472011-02-26 01:02:56 +00002680PyObject *
2681PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002682 const char *encoding,
2683 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002684{
2685 PyObject *v;
2686
2687 if (!PyUnicode_Check(unicode)) {
2688 PyErr_BadArgument();
2689 goto onError;
2690 }
2691
2692 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002693 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002694
2695 /* Decode via the codec registry */
2696 v = PyCodec_Decode(unicode, encoding, errors);
2697 if (v == NULL)
2698 goto onError;
2699 if (!PyUnicode_Check(v)) {
2700 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002701 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002702 Py_TYPE(v)->tp_name);
2703 Py_DECREF(v);
2704 goto onError;
2705 }
2706 return v;
2707
Benjamin Peterson29060642009-01-31 22:14:21 +00002708 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002709 return NULL;
2710}
2711
Alexander Belopolsky40018472011-02-26 01:02:56 +00002712PyObject *
2713PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002714 Py_ssize_t size,
2715 const char *encoding,
2716 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002717{
2718 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002719
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720 unicode = PyUnicode_FromUnicode(s, size);
2721 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002722 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2724 Py_DECREF(unicode);
2725 return v;
2726}
2727
Alexander Belopolsky40018472011-02-26 01:02:56 +00002728PyObject *
2729PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002730 const char *encoding,
2731 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002732{
2733 PyObject *v;
2734
2735 if (!PyUnicode_Check(unicode)) {
2736 PyErr_BadArgument();
2737 goto onError;
2738 }
2739
2740 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002741 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002742
2743 /* Encode via the codec registry */
2744 v = PyCodec_Encode(unicode, encoding, errors);
2745 if (v == NULL)
2746 goto onError;
2747 return v;
2748
Benjamin Peterson29060642009-01-31 22:14:21 +00002749 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002750 return NULL;
2751}
2752
Victor Stinnerad158722010-10-27 00:25:46 +00002753PyObject *
2754PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002755{
Victor Stinner99b95382011-07-04 14:23:54 +02002756#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002757 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2758 PyUnicode_GET_SIZE(unicode),
2759 NULL);
2760#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002761 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002762#else
Victor Stinner793b5312011-04-27 00:24:21 +02002763 PyInterpreterState *interp = PyThreadState_GET()->interp;
2764 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2765 cannot use it to encode and decode filenames before it is loaded. Load
2766 the Python codec requires to encode at least its own filename. Use the C
2767 version of the locale codec until the codec registry is initialized and
2768 the Python codec is loaded.
2769
2770 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2771 cannot only rely on it: check also interp->fscodec_initialized for
2772 subinterpreters. */
2773 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002774 return PyUnicode_AsEncodedString(unicode,
2775 Py_FileSystemDefaultEncoding,
2776 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002777 }
2778 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002779 /* locale encoding with surrogateescape */
2780 wchar_t *wchar;
2781 char *bytes;
2782 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002783 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002784
2785 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2786 if (wchar == NULL)
2787 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002788 bytes = _Py_wchar2char(wchar, &error_pos);
2789 if (bytes == NULL) {
2790 if (error_pos != (size_t)-1) {
2791 char *errmsg = strerror(errno);
2792 PyObject *exc = NULL;
2793 if (errmsg == NULL)
2794 errmsg = "Py_wchar2char() failed";
2795 raise_encode_exception(&exc,
2796 "filesystemencoding",
2797 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2798 error_pos, error_pos+1,
2799 errmsg);
2800 Py_XDECREF(exc);
2801 }
2802 else
2803 PyErr_NoMemory();
2804 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002805 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002806 }
2807 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002808
2809 bytes_obj = PyBytes_FromString(bytes);
2810 PyMem_Free(bytes);
2811 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002812 }
Victor Stinnerad158722010-10-27 00:25:46 +00002813#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002814}
2815
Alexander Belopolsky40018472011-02-26 01:02:56 +00002816PyObject *
2817PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002818 const char *encoding,
2819 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002820{
2821 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002822 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002823
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 if (!PyUnicode_Check(unicode)) {
2825 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002826 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827 }
Fred Drakee4315f52000-05-09 19:53:39 +00002828
Victor Stinner2f283c22011-03-02 01:21:46 +00002829 if (encoding == NULL) {
2830 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002831 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002832 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002833 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002834 }
Fred Drakee4315f52000-05-09 19:53:39 +00002835
2836 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002837 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002838 if ((strcmp(lower, "utf-8") == 0) ||
2839 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002840 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002841 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002842 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002843 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002844 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002845 }
Victor Stinner37296e82010-06-10 13:36:23 +00002846 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002847 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002848 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002849 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002850#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002851 else if (strcmp(lower, "mbcs") == 0)
2852 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2853 PyUnicode_GET_SIZE(unicode),
2854 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002855#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002856 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002857 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002858 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002859
2860 /* Encode via the codec registry */
2861 v = PyCodec_Encode(unicode, encoding, errors);
2862 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002863 return NULL;
2864
2865 /* The normal path */
2866 if (PyBytes_Check(v))
2867 return v;
2868
2869 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002870 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002871 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002872 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002873
2874 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2875 "encoder %s returned bytearray instead of bytes",
2876 encoding);
2877 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002878 Py_DECREF(v);
2879 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002880 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002881
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002882 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2883 Py_DECREF(v);
2884 return b;
2885 }
2886
2887 PyErr_Format(PyExc_TypeError,
2888 "encoder did not return a bytes object (type=%.400s)",
2889 Py_TYPE(v)->tp_name);
2890 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002891 return NULL;
2892}
2893
Alexander Belopolsky40018472011-02-26 01:02:56 +00002894PyObject *
2895PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002896 const char *encoding,
2897 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002898{
2899 PyObject *v;
2900
2901 if (!PyUnicode_Check(unicode)) {
2902 PyErr_BadArgument();
2903 goto onError;
2904 }
2905
2906 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002907 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002908
2909 /* Encode via the codec registry */
2910 v = PyCodec_Encode(unicode, encoding, errors);
2911 if (v == NULL)
2912 goto onError;
2913 if (!PyUnicode_Check(v)) {
2914 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002915 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002916 Py_TYPE(v)->tp_name);
2917 Py_DECREF(v);
2918 goto onError;
2919 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002920 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002921
Benjamin Peterson29060642009-01-31 22:14:21 +00002922 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002923 return NULL;
2924}
2925
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002926PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002927PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002928 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002929 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2930}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002931
Christian Heimes5894ba72007-11-04 11:43:14 +00002932PyObject*
2933PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2934{
Victor Stinner99b95382011-07-04 14:23:54 +02002935#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002936 return PyUnicode_DecodeMBCS(s, size, NULL);
2937#elif defined(__APPLE__)
2938 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2939#else
Victor Stinner793b5312011-04-27 00:24:21 +02002940 PyInterpreterState *interp = PyThreadState_GET()->interp;
2941 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2942 cannot use it to encode and decode filenames before it is loaded. Load
2943 the Python codec requires to encode at least its own filename. Use the C
2944 version of the locale codec until the codec registry is initialized and
2945 the Python codec is loaded.
2946
2947 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2948 cannot only rely on it: check also interp->fscodec_initialized for
2949 subinterpreters. */
2950 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002951 return PyUnicode_Decode(s, size,
2952 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002953 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002954 }
2955 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002956 /* locale encoding with surrogateescape */
2957 wchar_t *wchar;
2958 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002959 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002960
2961 if (s[size] != '\0' || size != strlen(s)) {
2962 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2963 return NULL;
2964 }
2965
Victor Stinner168e1172010-10-16 23:16:16 +00002966 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002967 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002968 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002969
Victor Stinner168e1172010-10-16 23:16:16 +00002970 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002971 PyMem_Free(wchar);
2972 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002973 }
Victor Stinnerad158722010-10-27 00:25:46 +00002974#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002975}
2976
Martin v. Löwis011e8422009-05-05 04:43:17 +00002977
2978int
2979PyUnicode_FSConverter(PyObject* arg, void* addr)
2980{
2981 PyObject *output = NULL;
2982 Py_ssize_t size;
2983 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002984 if (arg == NULL) {
2985 Py_DECREF(*(PyObject**)addr);
2986 return 1;
2987 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002988 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002989 output = arg;
2990 Py_INCREF(output);
2991 }
2992 else {
2993 arg = PyUnicode_FromObject(arg);
2994 if (!arg)
2995 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002996 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002997 Py_DECREF(arg);
2998 if (!output)
2999 return 0;
3000 if (!PyBytes_Check(output)) {
3001 Py_DECREF(output);
3002 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3003 return 0;
3004 }
3005 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003006 size = PyBytes_GET_SIZE(output);
3007 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003008 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003009 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003010 Py_DECREF(output);
3011 return 0;
3012 }
3013 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003014 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003015}
3016
3017
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003018int
3019PyUnicode_FSDecoder(PyObject* arg, void* addr)
3020{
3021 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003022 if (arg == NULL) {
3023 Py_DECREF(*(PyObject**)addr);
3024 return 1;
3025 }
3026 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003027 if (PyUnicode_READY(arg))
3028 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003029 output = arg;
3030 Py_INCREF(output);
3031 }
3032 else {
3033 arg = PyBytes_FromObject(arg);
3034 if (!arg)
3035 return 0;
3036 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3037 PyBytes_GET_SIZE(arg));
3038 Py_DECREF(arg);
3039 if (!output)
3040 return 0;
3041 if (!PyUnicode_Check(output)) {
3042 Py_DECREF(output);
3043 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3044 return 0;
3045 }
3046 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003047 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3048 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003049 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3050 Py_DECREF(output);
3051 return 0;
3052 }
3053 *(PyObject**)addr = output;
3054 return Py_CLEANUP_SUPPORTED;
3055}
3056
3057
Martin v. Löwis5b222132007-06-10 09:51:05 +00003058char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003059PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003060{
Christian Heimesf3863112007-11-22 07:46:41 +00003061 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003062 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3063
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003064 if (!PyUnicode_Check(unicode)) {
3065 PyErr_BadArgument();
3066 return NULL;
3067 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003068 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003069 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003070
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003071 if (PyUnicode_UTF8(unicode) == NULL) {
3072 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003073 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3074 if (bytes == NULL)
3075 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003076 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3077 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003078 Py_DECREF(bytes);
3079 return NULL;
3080 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003081 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3082 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003083 Py_DECREF(bytes);
3084 }
3085
3086 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003087 *psize = PyUnicode_UTF8_LENGTH(unicode);
3088 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003089}
3090
3091char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003092PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003093{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003094 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3095}
3096
3097#ifdef Py_DEBUG
3098int unicode_as_unicode_calls = 0;
3099#endif
3100
3101
3102Py_UNICODE *
3103PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3104{
3105 PyUnicodeObject *u;
3106 const unsigned char *one_byte;
3107#if SIZEOF_WCHAR_T == 4
3108 const Py_UCS2 *two_bytes;
3109#else
3110 const Py_UCS4 *four_bytes;
3111 const Py_UCS4 *ucs4_end;
3112 Py_ssize_t num_surrogates;
3113#endif
3114 wchar_t *w;
3115 wchar_t *wchar_end;
3116
3117 if (!PyUnicode_Check(unicode)) {
3118 PyErr_BadArgument();
3119 return NULL;
3120 }
3121 u = (PyUnicodeObject*)unicode;
3122 if (_PyUnicode_WSTR(u) == NULL) {
3123 /* Non-ASCII compact unicode object */
3124 assert(_PyUnicode_KIND(u) != 0);
3125 assert(PyUnicode_IS_READY(u));
3126
3127#ifdef Py_DEBUG
3128 ++unicode_as_unicode_calls;
3129#endif
3130
3131 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3132#if SIZEOF_WCHAR_T == 2
3133 four_bytes = PyUnicode_4BYTE_DATA(u);
3134 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3135 num_surrogates = 0;
3136
3137 for (; four_bytes < ucs4_end; ++four_bytes) {
3138 if (*four_bytes > 0xFFFF)
3139 ++num_surrogates;
3140 }
3141
3142 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3143 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3144 if (!_PyUnicode_WSTR(u)) {
3145 PyErr_NoMemory();
3146 return NULL;
3147 }
3148 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3149
3150 w = _PyUnicode_WSTR(u);
3151 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3152 four_bytes = PyUnicode_4BYTE_DATA(u);
3153 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3154 if (*four_bytes > 0xFFFF) {
3155 /* encode surrogate pair in this case */
3156 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3157 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3158 }
3159 else
3160 *w = *four_bytes;
3161
3162 if (w > wchar_end) {
3163 assert(0 && "Miscalculated string end");
3164 }
3165 }
3166 *w = 0;
3167#else
3168 /* sizeof(wchar_t) == 4 */
3169 Py_FatalError("Impossible unicode object state, wstr and str "
3170 "should share memory already.");
3171 return NULL;
3172#endif
3173 }
3174 else {
3175 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3176 (_PyUnicode_LENGTH(u) + 1));
3177 if (!_PyUnicode_WSTR(u)) {
3178 PyErr_NoMemory();
3179 return NULL;
3180 }
3181 if (!PyUnicode_IS_COMPACT_ASCII(u))
3182 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3183 w = _PyUnicode_WSTR(u);
3184 wchar_end = w + _PyUnicode_LENGTH(u);
3185
3186 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3187 one_byte = PyUnicode_1BYTE_DATA(u);
3188 for (; w < wchar_end; ++one_byte, ++w)
3189 *w = *one_byte;
3190 /* null-terminate the wstr */
3191 *w = 0;
3192 }
3193 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3194#if SIZEOF_WCHAR_T == 4
3195 two_bytes = PyUnicode_2BYTE_DATA(u);
3196 for (; w < wchar_end; ++two_bytes, ++w)
3197 *w = *two_bytes;
3198 /* null-terminate the wstr */
3199 *w = 0;
3200#else
3201 /* sizeof(wchar_t) == 2 */
3202 PyObject_FREE(_PyUnicode_WSTR(u));
3203 _PyUnicode_WSTR(u) = NULL;
3204 Py_FatalError("Impossible unicode object state, wstr "
3205 "and str should share memory already.");
3206 return NULL;
3207#endif
3208 }
3209 else {
3210 assert(0 && "This should never happen.");
3211 }
3212 }
3213 }
3214 if (size != NULL)
3215 *size = PyUnicode_WSTR_LENGTH(u);
3216 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003217}
3218
Alexander Belopolsky40018472011-02-26 01:02:56 +00003219Py_UNICODE *
3220PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003222 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223}
3224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003225
Alexander Belopolsky40018472011-02-26 01:02:56 +00003226Py_ssize_t
3227PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003228{
3229 if (!PyUnicode_Check(unicode)) {
3230 PyErr_BadArgument();
3231 goto onError;
3232 }
3233 return PyUnicode_GET_SIZE(unicode);
3234
Benjamin Peterson29060642009-01-31 22:14:21 +00003235 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236 return -1;
3237}
3238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003239Py_ssize_t
3240PyUnicode_GetLength(PyObject *unicode)
3241{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003242 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003243 PyErr_BadArgument();
3244 return -1;
3245 }
3246
3247 return PyUnicode_GET_LENGTH(unicode);
3248}
3249
3250Py_UCS4
3251PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3252{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003253 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3254 PyErr_BadArgument();
3255 return (Py_UCS4)-1;
3256 }
3257 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3258 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003259 return (Py_UCS4)-1;
3260 }
3261 return PyUnicode_READ_CHAR(unicode, index);
3262}
3263
3264int
3265PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3266{
3267 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003268 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003269 return -1;
3270 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003271 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3272 PyErr_SetString(PyExc_IndexError, "string index out of range");
3273 return -1;
3274 }
3275 if (_PyUnicode_Dirty(unicode))
3276 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003277 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3278 index, ch);
3279 return 0;
3280}
3281
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage; the caller must not free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3287
Victor Stinner554f3f02010-06-16 23:33:54 +00003288/* create or adjust a UnicodeDecodeError */
3289static void
3290make_decode_exception(PyObject **exceptionObject,
3291 const char *encoding,
3292 const char *input, Py_ssize_t length,
3293 Py_ssize_t startpos, Py_ssize_t endpos,
3294 const char *reason)
3295{
3296 if (*exceptionObject == NULL) {
3297 *exceptionObject = PyUnicodeDecodeError_Create(
3298 encoding, input, length, startpos, endpos, reason);
3299 }
3300 else {
3301 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3302 goto onError;
3303 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3304 goto onError;
3305 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3306 goto onError;
3307 }
3308 return;
3309
3310onError:
3311 Py_DECREF(*exceptionObject);
3312 *exceptionObject = NULL;
3313}
3314
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003315/* error handling callback helper:
3316 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003317 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003318 and adjust various state variables.
3319 return 0 on success, -1 on error
3320*/
3321
Alexander Belopolsky40018472011-02-26 01:02:56 +00003322static int
3323unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003324 const char *encoding, const char *reason,
3325 const char **input, const char **inend, Py_ssize_t *startinpos,
3326 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3327 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003328{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003329 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330
3331 PyObject *restuple = NULL;
3332 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003333 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003334 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003335 Py_ssize_t requiredsize;
3336 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003337 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003338 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003339 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003340 int res = -1;
3341
3342 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003343 *errorHandler = PyCodec_LookupError(errors);
3344 if (*errorHandler == NULL)
3345 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003346 }
3347
Victor Stinner554f3f02010-06-16 23:33:54 +00003348 make_decode_exception(exceptionObject,
3349 encoding,
3350 *input, *inend - *input,
3351 *startinpos, *endinpos,
3352 reason);
3353 if (*exceptionObject == NULL)
3354 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003355
3356 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3357 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003358 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003359 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003360 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003361 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003362 }
3363 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003364 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003365
3366 /* Copy back the bytes variables, which might have been modified by the
3367 callback */
3368 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3369 if (!inputobj)
3370 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003371 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003372 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003373 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003374 *input = PyBytes_AS_STRING(inputobj);
3375 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003376 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003377 /* we can DECREF safely, as the exception has another reference,
3378 so the object won't go away. */
3379 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003380
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003381 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003382 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003383 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003384 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3385 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003386 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003387
3388 /* need more space? (at least enough for what we
3389 have+the replacement+the rest of the string (starting
3390 at the new input position), so we won't have to check space
3391 when there are no errors in the rest of the string) */
3392 repptr = PyUnicode_AS_UNICODE(repunicode);
3393 repsize = PyUnicode_GET_SIZE(repunicode);
3394 requiredsize = *outpos + repsize + insize-newpos;
3395 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003396 if (requiredsize<2*outsize)
3397 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003398 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003399 goto onError;
3400 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003401 }
3402 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003403 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003404 Py_UNICODE_COPY(*outptr, repptr, repsize);
3405 *outptr += repsize;
3406 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003407
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003408 /* we made it! */
3409 res = 0;
3410
Benjamin Peterson29060642009-01-31 22:14:21 +00003411 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003412 Py_XDECREF(restuple);
3413 return res;
3414}
3415
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003416/* --- UTF-7 Codec -------------------------------------------------------- */
3417
Antoine Pitrou244651a2009-05-04 18:56:13 +00003418/* See RFC2152 for details. We encode conservatively and decode liberally. */
3419
3420/* Three simple macros defining base-64. */
3421
/* IS_BASE64: true if c belongs to the base-64 alphabet
   (A-Z, a-z, 0-9, '+' and '/'). */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* FROM_BASE64: the 6-bit value of base-64 character c.  The caller must
   have checked IS_BASE64(c); any other character yields 63, like '/'. */

#define FROM_BASE64(c)                                  \
    (((c) == '+') ? 62 :                                \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)

/* TO_BASE64: the base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)                    \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"      \
      "abcdefghijklmnopqrstuvwxyz"      \
      "0123456789+/")[(n) & 63])

/* DECODE_DIRECT: true if a byte found in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which starts a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
3450
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c:        character to classify; evaluated several times, so it must
 *           not have side effects.  Only 1..127 can ever be direct.
 * directO:  non-zero if Set O characters are encoded as themselves.
 * directWS: non-zero if whitespace is encoded as itself.
 *
 * The flag arguments are parenthesized so that expressions such as
 * "flag ? 1 : 0" can be passed without changing how the macro parses. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003496
Alexander Belopolsky40018472011-02-26 01:02:56 +00003497PyObject *
3498PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003499 Py_ssize_t size,
3500 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003501{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003502 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3503}
3504
Antoine Pitrou244651a2009-05-04 18:56:13 +00003505/* The decoder. The only state we preserve is our read position,
3506 * i.e. how many characters we have consumed. So if we end in the
3507 * middle of a shift sequence we have to back off the read position
3508 * and the output to the beginning of the sequence, otherwise we lose
3509 * all the shift state (seen bits, number of bits seen, high
3510 * surrogate). */
3511
Alexander Belopolsky40018472011-02-26 01:02:56 +00003512PyObject *
3513PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003514 Py_ssize_t size,
3515 const char *errors,
3516 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003517{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003519 Py_ssize_t startinpos;
3520 Py_ssize_t endinpos;
3521 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003522 const char *e;
3523 PyUnicodeObject *unicode;
3524 Py_UNICODE *p;
3525 const char *errmsg = "";
3526 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003527 Py_UNICODE *shiftOutStart;
3528 unsigned int base64bits = 0;
3529 unsigned long base64buffer = 0;
3530 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 PyObject *errorHandler = NULL;
3532 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003533
3534 unicode = _PyUnicode_New(size);
3535 if (!unicode)
3536 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003537 if (size == 0) {
3538 if (consumed)
3539 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003540 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003541 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003543 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003544 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003545 e = s + size;
3546
3547 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003548 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003549 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003550 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003551
Antoine Pitrou244651a2009-05-04 18:56:13 +00003552 if (inShift) { /* in a base-64 section */
3553 if (IS_BASE64(ch)) { /* consume a base-64 character */
3554 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3555 base64bits += 6;
3556 s++;
3557 if (base64bits >= 16) {
3558 /* we have enough bits for a UTF-16 value */
3559 Py_UNICODE outCh = (Py_UNICODE)
3560 (base64buffer >> (base64bits-16));
3561 base64bits -= 16;
3562 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3563 if (surrogate) {
3564 /* expecting a second surrogate */
3565 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3566#ifdef Py_UNICODE_WIDE
3567 *p++ = (((surrogate & 0x3FF)<<10)
3568 | (outCh & 0x3FF)) + 0x10000;
3569#else
3570 *p++ = surrogate;
3571 *p++ = outCh;
3572#endif
3573 surrogate = 0;
3574 }
3575 else {
3576 surrogate = 0;
3577 errmsg = "second surrogate missing";
3578 goto utf7Error;
3579 }
3580 }
3581 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3582 /* first surrogate */
3583 surrogate = outCh;
3584 }
3585 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3586 errmsg = "unexpected second surrogate";
3587 goto utf7Error;
3588 }
3589 else {
3590 *p++ = outCh;
3591 }
3592 }
3593 }
3594 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003595 inShift = 0;
3596 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003597 if (surrogate) {
3598 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003599 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003600 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003601 if (base64bits > 0) { /* left-over bits */
3602 if (base64bits >= 6) {
3603 /* We've seen at least one base-64 character */
3604 errmsg = "partial character in shift sequence";
3605 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003606 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003607 else {
3608 /* Some bits remain; they should be zero */
3609 if (base64buffer != 0) {
3610 errmsg = "non-zero padding bits in shift sequence";
3611 goto utf7Error;
3612 }
3613 }
3614 }
3615 if (ch != '-') {
3616 /* '-' is absorbed; other terminating
3617 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003618 *p++ = ch;
3619 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003620 }
3621 }
3622 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003623 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003624 s++; /* consume '+' */
3625 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003626 s++;
3627 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003628 }
3629 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003630 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003631 shiftOutStart = p;
3632 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003633 }
3634 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003635 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003636 *p++ = ch;
3637 s++;
3638 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003639 else {
3640 startinpos = s-starts;
3641 s++;
3642 errmsg = "unexpected special character";
3643 goto utf7Error;
3644 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003645 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003646utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003647 outpos = p-PyUnicode_AS_UNICODE(unicode);
3648 endinpos = s-starts;
3649 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003650 errors, &errorHandler,
3651 "utf7", errmsg,
3652 &starts, &e, &startinpos, &endinpos, &exc, &s,
3653 &unicode, &outpos, &p))
3654 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003655 }
3656
Antoine Pitrou244651a2009-05-04 18:56:13 +00003657 /* end of string */
3658
3659 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3660 /* if we're in an inconsistent state, that's an error */
3661 if (surrogate ||
3662 (base64bits >= 6) ||
3663 (base64bits > 0 && base64buffer != 0)) {
3664 outpos = p-PyUnicode_AS_UNICODE(unicode);
3665 endinpos = size;
3666 if (unicode_decode_call_errorhandler(
3667 errors, &errorHandler,
3668 "utf7", "unterminated shift sequence",
3669 &starts, &e, &startinpos, &endinpos, &exc, &s,
3670 &unicode, &outpos, &p))
3671 goto onError;
3672 if (s < e)
3673 goto restart;
3674 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003675 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003676
3677 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003678 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003679 if (inShift) {
3680 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003681 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003682 }
3683 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003684 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003685 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003686 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003687
Victor Stinnerfe226c02011-10-03 03:52:20 +02003688 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003689 goto onError;
3690
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003691 Py_XDECREF(errorHandler);
3692 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003693#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003694 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003695 Py_DECREF(unicode);
3696 return NULL;
3697 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003698#endif
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003699 return (PyObject *)unicode;
3700
Benjamin Peterson29060642009-01-31 22:14:21 +00003701 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003702 Py_XDECREF(errorHandler);
3703 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003704 Py_DECREF(unicode);
3705 return NULL;
3706}
3707
3708
Alexander Belopolsky40018472011-02-26 01:02:56 +00003709PyObject *
3710PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003711 Py_ssize_t size,
3712 int base64SetO,
3713 int base64WhiteSpace,
3714 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003715{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003716 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003717 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003718 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003719 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003720 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003721 unsigned int base64bits = 0;
3722 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003723 char * out;
3724 char * start;
3725
3726 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003727 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003728
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003729 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003730 return PyErr_NoMemory();
3731
Antoine Pitrou244651a2009-05-04 18:56:13 +00003732 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003733 if (v == NULL)
3734 return NULL;
3735
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003736 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003737 for (;i < size; ++i) {
3738 Py_UNICODE ch = s[i];
3739
Antoine Pitrou244651a2009-05-04 18:56:13 +00003740 if (inShift) {
3741 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3742 /* shifting out */
3743 if (base64bits) { /* output remaining bits */
3744 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3745 base64buffer = 0;
3746 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003747 }
3748 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003749 /* Characters not in the BASE64 set implicitly unshift the sequence
3750 so no '-' is required, except if the character is itself a '-' */
3751 if (IS_BASE64(ch) || ch == '-') {
3752 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003753 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003754 *out++ = (char) ch;
3755 }
3756 else {
3757 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003758 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003759 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003760 else { /* not in a shift sequence */
3761 if (ch == '+') {
3762 *out++ = '+';
3763 *out++ = '-';
3764 }
3765 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3766 *out++ = (char) ch;
3767 }
3768 else {
3769 *out++ = '+';
3770 inShift = 1;
3771 goto encode_char;
3772 }
3773 }
3774 continue;
3775encode_char:
3776#ifdef Py_UNICODE_WIDE
3777 if (ch >= 0x10000) {
3778 /* code first surrogate */
3779 base64bits += 16;
3780 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3781 while (base64bits >= 6) {
3782 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3783 base64bits -= 6;
3784 }
3785 /* prepare second surrogate */
3786 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3787 }
3788#endif
3789 base64bits += 16;
3790 base64buffer = (base64buffer << 16) | ch;
3791 while (base64bits >= 6) {
3792 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3793 base64bits -= 6;
3794 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003795 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003796 if (base64bits)
3797 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3798 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003799 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003800 if (_PyBytes_Resize(&v, out - start) < 0)
3801 return NULL;
3802 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003803}
3804
Antoine Pitrou244651a2009-05-04 18:56:13 +00003805#undef IS_BASE64
3806#undef FROM_BASE64
3807#undef TO_BASE64
3808#undef DECODE_DIRECT
3809#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003810
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811/* --- UTF-8 Codec -------------------------------------------------------- */
3812
/* Map a UTF-8 lead byte to the total length of the sequence it starts.
   Zero means the byte is not a legal lead byte (continuation bytes 80-BF,
   the overlong prefixes C0-C1, and F5-FF).  See RFC 3629 for details.
   The table is only ever read, hence const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3834
Alexander Belopolsky40018472011-02-26 01:02:56 +00003835PyObject *
3836PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003837 Py_ssize_t size,
3838 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839{
Walter Dörwald69652032004-09-07 20:24:22 +00003840 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3841}
3842
Antoine Pitrouab868312009-01-10 15:40:25 +00003843/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3844#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3845
3846/* Mask to quickly check whether a C 'long' contains a
3847 non-ASCII, UTF8-encoded char. */
3848#if (SIZEOF_LONG == 8)
3849# define ASCII_CHAR_MASK 0x8080808080808080L
3850#elif (SIZEOF_LONG == 4)
3851# define ASCII_CHAR_MASK 0x80808080L
3852#else
3853# error C 'long' size should be either 4 or 8!
3854#endif
3855
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003856/* Scans a UTF-8 string and returns the maximum character to be expected,
3857 the size of the decoded unicode string and if any major errors were
3858 encountered.
3859
3860 This function does check basic UTF-8 sanity, it does however NOT CHECK
3861 if the string contains surrogates, and if all continuation bytes are
3862 within the correct ranges, these checks are performed in
3863 PyUnicode_DecodeUTF8Stateful.
3864
3865 If it sets has_errors to 1, it means the value of unicode_size and max_char
3866 will be bogus and you should not rely on useful information in them.
3867 */
3868static Py_UCS4
3869utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3870 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3871 int *has_errors)
3872{
3873 Py_ssize_t n;
3874 Py_ssize_t char_count = 0;
3875 Py_UCS4 max_char = 127, new_max;
3876 Py_UCS4 upper_bound;
3877 const unsigned char *p = (const unsigned char *)s;
3878 const unsigned char *end = p + string_size;
3879 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3880 int err = 0;
3881
3882 for (; p < end && !err; ++p, ++char_count) {
3883 /* Only check value if it's not a ASCII char... */
3884 if (*p < 0x80) {
3885 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3886 an explanation. */
3887 if (!((size_t) p & LONG_PTR_MASK)) {
3888 /* Help register allocation */
3889 register const unsigned char *_p = p;
3890 while (_p < aligned_end) {
3891 unsigned long value = *(unsigned long *) _p;
3892 if (value & ASCII_CHAR_MASK)
3893 break;
3894 _p += SIZEOF_LONG;
3895 char_count += SIZEOF_LONG;
3896 }
3897 p = _p;
3898 if (p == end)
3899 break;
3900 }
3901 }
3902 if (*p >= 0x80) {
3903 n = utf8_code_length[*p];
3904 new_max = max_char;
3905 switch (n) {
3906 /* invalid start byte */
3907 case 0:
3908 err = 1;
3909 break;
3910 case 2:
3911 /* Code points between 0x00FF and 0x07FF inclusive.
3912 Approximate the upper bound of the code point,
3913 if this flips over 255 we can be sure it will be more
3914 than 255 and the string will need 2 bytes per code coint,
3915 if it stays under or equal to 255, we can be sure 1 byte
3916 is enough.
3917 ((*p & 0b00011111) << 6) | 0b00111111 */
3918 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3919 if (max_char < upper_bound)
3920 new_max = upper_bound;
3921 /* Ensure we track at least that we left ASCII space. */
3922 if (new_max < 128)
3923 new_max = 128;
3924 break;
3925 case 3:
3926 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3927 always > 255 and <= 65535 and will always need 2 bytes. */
3928 if (max_char < 65535)
3929 new_max = 65535;
3930 break;
3931 case 4:
3932 /* Code point will be above 0xFFFF for sure in this case. */
3933 new_max = 65537;
3934 break;
3935 /* Internal error, this should be caught by the first if */
3936 case 1:
3937 default:
3938 assert(0 && "Impossible case in utf8_max_char_and_size");
3939 err = 1;
3940 }
3941 /* Instead of number of overall bytes for this code point,
3942 n containts the number of following bytes: */
3943 --n;
3944 /* Check if the follow up chars are all valid continuation bytes */
3945 if (n >= 1) {
3946 const unsigned char *cont;
3947 if ((p + n) >= end) {
3948 if (consumed == 0)
3949 /* incomplete data, non-incremental decoding */
3950 err = 1;
3951 break;
3952 }
3953 for (cont = p + 1; cont < (p + n); ++cont) {
3954 if ((*cont & 0xc0) != 0x80) {
3955 err = 1;
3956 break;
3957 }
3958 }
3959 p += n;
3960 }
3961 else
3962 err = 1;
3963 max_char = new_max;
3964 }
3965 }
3966
3967 if (unicode_size)
3968 *unicode_size = char_count;
3969 if (has_errors)
3970 *has_errors = err;
3971 return max_char;
3972}
3973
/* Store `value` at position `index` of `buf`, choosing the element type
   from `kind`.  Like PyUnicode_WRITE, but it additionally understands
   PyUnicode_WCHAR_KIND, i.e. the Py_UNICODE (wstr) buffer of the legacy
   unicode representation.  Any kind other than WCHAR, 1BYTE or 2BYTE is
   written as Py_UCS4.  Each argument is evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        switch ((int)(kind)) { \
        case PyUnicode_WCHAR_KIND: \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
            break; \
        case PyUnicode_1BYTE_KIND: \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
3988
Alexander Belopolsky40018472011-02-26 01:02:56 +00003989PyObject *
3990PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003991 Py_ssize_t size,
3992 const char *errors,
3993 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003994{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003995 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003997 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003998 Py_ssize_t startinpos;
3999 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004000 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004001 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004002 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004003 PyObject *errorHandler = NULL;
4004 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004005 Py_UCS4 maxchar = 0;
4006 Py_ssize_t unicode_size;
4007 Py_ssize_t i;
4008 int kind;
4009 void *data;
4010 int has_errors;
4011 Py_UNICODE *error_outptr;
4012#if SIZEOF_WCHAR_T == 2
4013 Py_ssize_t wchar_offset = 0;
4014#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015
Walter Dörwald69652032004-09-07 20:24:22 +00004016 if (size == 0) {
4017 if (consumed)
4018 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004020 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004021 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4022 consumed, &has_errors);
4023 if (has_errors) {
4024 unicode = _PyUnicode_New(size);
4025 if (!unicode)
4026 return NULL;
4027 kind = PyUnicode_WCHAR_KIND;
4028 data = PyUnicode_AS_UNICODE(unicode);
4029 assert(data != NULL);
4030 }
4031 else {
4032 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4033 if (!unicode)
4034 return NULL;
4035 /* When the string is ASCII only, just use memcpy and return.
4036 unicode_size may be != size if there is an incomplete UTF-8
4037 sequence at the end of the ASCII block. */
4038 if (maxchar < 128 && size == unicode_size) {
4039 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4040 return (PyObject *)unicode;
4041 }
4042 kind = PyUnicode_KIND(unicode);
4043 data = PyUnicode_DATA(unicode);
4044 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004045 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004046 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004047 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004048 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004049
4050 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004051 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052
4053 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004054 /* Fast path for runs of ASCII characters. Given that common UTF-8
4055 input will consist of an overwhelming majority of ASCII
4056 characters, we try to optimize for this case by checking
4057 as many characters as a C 'long' can contain.
4058 First, check if we can do an aligned read, as most CPUs have
4059 a penalty for unaligned reads.
4060 */
4061 if (!((size_t) s & LONG_PTR_MASK)) {
4062 /* Help register allocation */
4063 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004064 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004065 while (_s < aligned_end) {
4066 /* Read a whole long at a time (either 4 or 8 bytes),
4067 and do a fast unrolled copy if it only contains ASCII
4068 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004069 unsigned long value = *(unsigned long *) _s;
4070 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004071 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004072 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4073 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4074 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4075 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004076#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004077 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4078 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4079 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4080 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004081#endif
4082 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004083 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004084 }
4085 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004086 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004087 if (s == e)
4088 break;
4089 ch = (unsigned char)*s;
4090 }
4091 }
4092
4093 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004094 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095 s++;
4096 continue;
4097 }
4098
4099 n = utf8_code_length[ch];
4100
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004101 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004102 if (consumed)
4103 break;
4104 else {
4105 errmsg = "unexpected end of data";
4106 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004107 endinpos = startinpos+1;
4108 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4109 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004110 goto utf8Error;
4111 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004112 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113
4114 switch (n) {
4115
4116 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004117 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004118 startinpos = s-starts;
4119 endinpos = startinpos+1;
4120 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121
4122 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004123 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004124 startinpos = s-starts;
4125 endinpos = startinpos+1;
4126 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127
4128 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004129 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004130 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004131 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004132 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004133 goto utf8Error;
4134 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004136 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004137 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138 break;
4139
4140 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004141 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4142 will result in surrogates in range d800-dfff. Surrogates are
4143 not valid UTF-8 so they are rejected.
4144 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4145 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004146 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004147 (s[2] & 0xc0) != 0x80 ||
4148 ((unsigned char)s[0] == 0xE0 &&
4149 (unsigned char)s[1] < 0xA0) ||
4150 ((unsigned char)s[0] == 0xED &&
4151 (unsigned char)s[1] > 0x9F)) {
4152 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004153 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004154 endinpos = startinpos + 1;
4155
4156 /* if s[1] first two bits are 1 and 0, then the invalid
4157 continuation byte is s[2], so increment endinpos by 1,
4158 if not, s[1] is invalid and endinpos doesn't need to
4159 be incremented. */
4160 if ((s[1] & 0xC0) == 0x80)
4161 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004162 goto utf8Error;
4163 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004164 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004165 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004166 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004167 break;
4168
4169 case 4:
4170 if ((s[1] & 0xc0) != 0x80 ||
4171 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004172 (s[3] & 0xc0) != 0x80 ||
4173 ((unsigned char)s[0] == 0xF0 &&
4174 (unsigned char)s[1] < 0x90) ||
4175 ((unsigned char)s[0] == 0xF4 &&
4176 (unsigned char)s[1] > 0x8F)) {
4177 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004178 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004179 endinpos = startinpos + 1;
4180 if ((s[1] & 0xC0) == 0x80) {
4181 endinpos++;
4182 if ((s[2] & 0xC0) == 0x80)
4183 endinpos++;
4184 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004185 goto utf8Error;
4186 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004187 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004188 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4189 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004191 /* If the string is flexible or we have native UCS-4, write
4192 directly.. */
4193 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4194 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004196 else {
4197 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004198
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004199 /* translate from 10000..10FFFF to 0..FFFF */
4200 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004202 /* high surrogate = top 10 bits added to D800 */
4203 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4204 (Py_UNICODE)(0xD800 + (ch >> 10)));
4205
4206 /* low surrogate = bottom 10 bits added to DC00 */
4207 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4208 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4209 }
4210#if SIZEOF_WCHAR_T == 2
4211 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004212#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004213 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214 }
4215 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004216 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004217
Benjamin Peterson29060642009-01-31 22:14:21 +00004218 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004219 /* If this is not yet a resizable string, make it one.. */
4220 if (kind != PyUnicode_WCHAR_KIND) {
4221 const Py_UNICODE *u;
4222 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4223 if (!new_unicode)
4224 goto onError;
4225 u = PyUnicode_AsUnicode((PyObject *)unicode);
4226 if (!u)
4227 goto onError;
4228#if SIZEOF_WCHAR_T == 2
4229 i += wchar_offset;
4230#endif
4231 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4232 Py_DECREF(unicode);
4233 unicode = new_unicode;
4234 kind = 0;
4235 data = PyUnicode_AS_UNICODE(new_unicode);
4236 assert(data != NULL);
4237 }
4238 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004239 if (unicode_decode_call_errorhandler(
4240 errors, &errorHandler,
4241 "utf8", errmsg,
4242 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004243 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004244 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004245 /* Update data because unicode_decode_call_errorhandler might have
4246 re-created or resized the unicode object. */
4247 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004248 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004249 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004250 /* Ensure the unicode_size calculation above was correct: */
4251 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4252
Walter Dörwald69652032004-09-07 20:24:22 +00004253 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004254 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004256 /* Adjust length and ready string when it contained errors and
4257 is of the old resizable kind. */
4258 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004259 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004260 goto onError;
4261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004262
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004263 Py_XDECREF(errorHandler);
4264 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004265#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004266 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004267 Py_DECREF(unicode);
4268 return NULL;
4269 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004270#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004271 return (PyObject *)unicode;
4272
Benjamin Peterson29060642009-01-31 22:14:21 +00004273 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004274 Py_XDECREF(errorHandler);
4275 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276 Py_DECREF(unicode);
4277 return NULL;
4278}
4279
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004280#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004281
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004282#ifdef __APPLE__
4283
4284/* Simplified UTF-8 decoder using surrogateescape error handler,
4285 used to decode the command line arguments on Mac OS X. */
4286
4287wchar_t*
4288_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4289{
4290 int n;
4291 const char *e;
4292 wchar_t *unicode, *p;
4293
4294 /* Note: size will always be longer than the resulting Unicode
4295 character count */
4296 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4297 PyErr_NoMemory();
4298 return NULL;
4299 }
4300 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4301 if (!unicode)
4302 return NULL;
4303
4304 /* Unpack UTF-8 encoded data */
4305 p = unicode;
4306 e = s + size;
4307 while (s < e) {
4308 Py_UCS4 ch = (unsigned char)*s;
4309
4310 if (ch < 0x80) {
4311 *p++ = (wchar_t)ch;
4312 s++;
4313 continue;
4314 }
4315
4316 n = utf8_code_length[ch];
4317 if (s + n > e) {
4318 goto surrogateescape;
4319 }
4320
4321 switch (n) {
4322 case 0:
4323 case 1:
4324 goto surrogateescape;
4325
4326 case 2:
4327 if ((s[1] & 0xc0) != 0x80)
4328 goto surrogateescape;
4329 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4330 assert ((ch > 0x007F) && (ch <= 0x07FF));
4331 *p++ = (wchar_t)ch;
4332 break;
4333
4334 case 3:
4335 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4336 will result in surrogates in range d800-dfff. Surrogates are
4337 not valid UTF-8 so they are rejected.
4338 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4339 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4340 if ((s[1] & 0xc0) != 0x80 ||
4341 (s[2] & 0xc0) != 0x80 ||
4342 ((unsigned char)s[0] == 0xE0 &&
4343 (unsigned char)s[1] < 0xA0) ||
4344 ((unsigned char)s[0] == 0xED &&
4345 (unsigned char)s[1] > 0x9F)) {
4346
4347 goto surrogateescape;
4348 }
4349 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4350 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004351 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004352 break;
4353
4354 case 4:
4355 if ((s[1] & 0xc0) != 0x80 ||
4356 (s[2] & 0xc0) != 0x80 ||
4357 (s[3] & 0xc0) != 0x80 ||
4358 ((unsigned char)s[0] == 0xF0 &&
4359 (unsigned char)s[1] < 0x90) ||
4360 ((unsigned char)s[0] == 0xF4 &&
4361 (unsigned char)s[1] > 0x8F)) {
4362 goto surrogateescape;
4363 }
4364 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4365 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4366 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4367
4368#if SIZEOF_WCHAR_T == 4
4369 *p++ = (wchar_t)ch;
4370#else
4371 /* compute and append the two surrogates: */
4372
4373 /* translate from 10000..10FFFF to 0..FFFF */
4374 ch -= 0x10000;
4375
4376 /* high surrogate = top 10 bits added to D800 */
4377 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4378
4379 /* low surrogate = bottom 10 bits added to DC00 */
4380 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4381#endif
4382 break;
4383 }
4384 s += n;
4385 continue;
4386
4387 surrogateescape:
4388 *p++ = 0xDC00 + ch;
4389 s++;
4390 }
4391 *p = L'\0';
4392 return unicode;
4393}
4394
4395#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004396
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004397/* Primary internal function which creates utf8 encoded bytes objects.
4398
4399 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004400 and allocate exactly as much space needed at the end. Else allocate the
4401 maximum possible needed (4 result bytes per Unicode character), and return
4402 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004403*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004404PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004405_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004406{
Tim Peters602f7402002-04-27 18:03:26 +00004407#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004408
Guido van Rossum98297ee2007-11-06 21:34:58 +00004409 Py_ssize_t i; /* index into s of next input byte */
4410 PyObject *result; /* result string object */
4411 char *p; /* next free byte in output buffer */
4412 Py_ssize_t nallocated; /* number of result bytes allocated */
4413 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004414 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004415 PyObject *errorHandler = NULL;
4416 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004417 int kind;
4418 void *data;
4419 Py_ssize_t size;
4420 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4421#if SIZEOF_WCHAR_T == 2
4422 Py_ssize_t wchar_offset = 0;
4423#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004425 if (!PyUnicode_Check(unicode)) {
4426 PyErr_BadArgument();
4427 return NULL;
4428 }
4429
4430 if (PyUnicode_READY(unicode) == -1)
4431 return NULL;
4432
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004433 if (PyUnicode_UTF8(unicode))
4434 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4435 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004436
4437 kind = PyUnicode_KIND(unicode);
4438 data = PyUnicode_DATA(unicode);
4439 size = PyUnicode_GET_LENGTH(unicode);
4440
Tim Peters602f7402002-04-27 18:03:26 +00004441 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442
Tim Peters602f7402002-04-27 18:03:26 +00004443 if (size <= MAX_SHORT_UNICHARS) {
4444 /* Write into the stack buffer; nallocated can't overflow.
4445 * At the end, we'll allocate exactly as much heap space as it
4446 * turns out we need.
4447 */
4448 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004449 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004450 p = stackbuf;
4451 }
4452 else {
4453 /* Overallocate on the heap, and give the excess back at the end. */
4454 nallocated = size * 4;
4455 if (nallocated / 4 != size) /* overflow! */
4456 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004457 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004458 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004459 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004460 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004461 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004462
Tim Peters602f7402002-04-27 18:03:26 +00004463 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004464 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004465
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004466 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004467 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004469
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004471 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004472 *p++ = (char)(0xc0 | (ch >> 6));
4473 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004474 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004475 Py_ssize_t newpos;
4476 PyObject *rep;
4477 Py_ssize_t repsize, k, startpos;
4478 startpos = i-1;
4479#if SIZEOF_WCHAR_T == 2
4480 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004481#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004482 rep = unicode_encode_call_errorhandler(
4483 errors, &errorHandler, "utf-8", "surrogates not allowed",
4484 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4485 &exc, startpos, startpos+1, &newpos);
4486 if (!rep)
4487 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004489 if (PyBytes_Check(rep))
4490 repsize = PyBytes_GET_SIZE(rep);
4491 else
4492 repsize = PyUnicode_GET_SIZE(rep);
4493
4494 if (repsize > 4) {
4495 Py_ssize_t offset;
4496
4497 if (result == NULL)
4498 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004499 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004500 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004502 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4503 /* integer overflow */
4504 PyErr_NoMemory();
4505 goto error;
4506 }
4507 nallocated += repsize - 4;
4508 if (result != NULL) {
4509 if (_PyBytes_Resize(&result, nallocated) < 0)
4510 goto error;
4511 } else {
4512 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004513 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004514 goto error;
4515 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4516 }
4517 p = PyBytes_AS_STRING(result) + offset;
4518 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004519
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004520 if (PyBytes_Check(rep)) {
4521 char *prep = PyBytes_AS_STRING(rep);
4522 for(k = repsize; k > 0; k--)
4523 *p++ = *prep++;
4524 } else /* rep is unicode */ {
4525 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4526 Py_UNICODE c;
4527
4528 for(k=0; k<repsize; k++) {
4529 c = prep[k];
4530 if (0x80 <= c) {
4531 raise_encode_exception(&exc, "utf-8",
4532 PyUnicode_AS_UNICODE(unicode),
4533 size, i-1, i,
4534 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004535 goto error;
4536 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004537 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004538 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004540 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004541 } else if (ch < 0x10000) {
4542 *p++ = (char)(0xe0 | (ch >> 12));
4543 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4544 *p++ = (char)(0x80 | (ch & 0x3f));
4545 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004546 /* Encode UCS4 Unicode ordinals */
4547 *p++ = (char)(0xf0 | (ch >> 18));
4548 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4549 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4550 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004551#if SIZEOF_WCHAR_T == 2
4552 wchar_offset++;
4553#endif
Tim Peters602f7402002-04-27 18:03:26 +00004554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004556
Guido van Rossum98297ee2007-11-06 21:34:58 +00004557 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004558 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004559 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004560 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004561 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004562 }
4563 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004564 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004565 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004566 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004567 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004568 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004569
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004570 Py_XDECREF(errorHandler);
4571 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004572 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004573 error:
4574 Py_XDECREF(errorHandler);
4575 Py_XDECREF(exc);
4576 Py_XDECREF(result);
4577 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004578
Tim Peters602f7402002-04-27 18:03:26 +00004579#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004580}
4581
Alexander Belopolsky40018472011-02-26 01:02:56 +00004582PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004583PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4584 Py_ssize_t size,
4585 const char *errors)
4586{
4587 PyObject *v, *unicode;
4588
4589 unicode = PyUnicode_FromUnicode(s, size);
4590 if (unicode == NULL)
4591 return NULL;
4592 v = _PyUnicode_AsUTF8String(unicode, errors);
4593 Py_DECREF(unicode);
4594 return v;
4595}
4596
4597PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004598PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004600 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004601}
4602
Walter Dörwald41980ca2007-08-16 21:55:45 +00004603/* --- UTF-32 Codec ------------------------------------------------------- */
4604
4605PyObject *
4606PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004607 Py_ssize_t size,
4608 const char *errors,
4609 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004610{
4611 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4612}
4613
4614PyObject *
4615PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004616 Py_ssize_t size,
4617 const char *errors,
4618 int *byteorder,
4619 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004620{
4621 const char *starts = s;
4622 Py_ssize_t startinpos;
4623 Py_ssize_t endinpos;
4624 Py_ssize_t outpos;
4625 PyUnicodeObject *unicode;
4626 Py_UNICODE *p;
4627#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004628 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004629 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004630#else
4631 const int pairs = 0;
4632#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004633 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004634 int bo = 0; /* assume native ordering by default */
4635 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004636 /* Offsets from q for retrieving bytes in the right order. */
4637#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4638 int iorder[] = {0, 1, 2, 3};
4639#else
4640 int iorder[] = {3, 2, 1, 0};
4641#endif
4642 PyObject *errorHandler = NULL;
4643 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004644
Walter Dörwald41980ca2007-08-16 21:55:45 +00004645 q = (unsigned char *)s;
4646 e = q + size;
4647
4648 if (byteorder)
4649 bo = *byteorder;
4650
4651 /* Check for BOM marks (U+FEFF) in the input and adjust current
4652 byte order setting accordingly. In native mode, the leading BOM
4653 mark is skipped, in all other modes, it is copied to the output
4654 stream as-is (giving a ZWNBSP character). */
4655 if (bo == 0) {
4656 if (size >= 4) {
4657 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004658 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004659#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004660 if (bom == 0x0000FEFF) {
4661 q += 4;
4662 bo = -1;
4663 }
4664 else if (bom == 0xFFFE0000) {
4665 q += 4;
4666 bo = 1;
4667 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004668#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004669 if (bom == 0x0000FEFF) {
4670 q += 4;
4671 bo = 1;
4672 }
4673 else if (bom == 0xFFFE0000) {
4674 q += 4;
4675 bo = -1;
4676 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004677#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004678 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004679 }
4680
4681 if (bo == -1) {
4682 /* force LE */
4683 iorder[0] = 0;
4684 iorder[1] = 1;
4685 iorder[2] = 2;
4686 iorder[3] = 3;
4687 }
4688 else if (bo == 1) {
4689 /* force BE */
4690 iorder[0] = 3;
4691 iorder[1] = 2;
4692 iorder[2] = 1;
4693 iorder[3] = 0;
4694 }
4695
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004696 /* On narrow builds we split characters outside the BMP into two
4697 codepoints => count how much extra space we need. */
4698#ifndef Py_UNICODE_WIDE
4699 for (qq = q; qq < e; qq += 4)
4700 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4701 pairs++;
4702#endif
4703
4704 /* This might be one to much, because of a BOM */
4705 unicode = _PyUnicode_New((size+3)/4+pairs);
4706 if (!unicode)
4707 return NULL;
4708 if (size == 0)
4709 return (PyObject *)unicode;
4710
4711 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004712 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004713
Walter Dörwald41980ca2007-08-16 21:55:45 +00004714 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004715 Py_UCS4 ch;
4716 /* remaining bytes at the end? (size should be divisible by 4) */
4717 if (e-q<4) {
4718 if (consumed)
4719 break;
4720 errmsg = "truncated data";
4721 startinpos = ((const char *)q)-starts;
4722 endinpos = ((const char *)e)-starts;
4723 goto utf32Error;
4724 /* The remaining input chars are ignored if the callback
4725 chooses to skip the input */
4726 }
4727 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4728 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004729
Benjamin Peterson29060642009-01-31 22:14:21 +00004730 if (ch >= 0x110000)
4731 {
4732 errmsg = "codepoint not in range(0x110000)";
4733 startinpos = ((const char *)q)-starts;
4734 endinpos = startinpos+4;
4735 goto utf32Error;
4736 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004737#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004738 if (ch >= 0x10000)
4739 {
4740 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4741 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4742 }
4743 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004744#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004745 *p++ = ch;
4746 q += 4;
4747 continue;
4748 utf32Error:
4749 outpos = p-PyUnicode_AS_UNICODE(unicode);
4750 if (unicode_decode_call_errorhandler(
4751 errors, &errorHandler,
4752 "utf32", errmsg,
4753 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4754 &unicode, &outpos, &p))
4755 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004756 }
4757
4758 if (byteorder)
4759 *byteorder = bo;
4760
4761 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004762 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004763
4764 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004765 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004766 goto onError;
4767
4768 Py_XDECREF(errorHandler);
4769 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004770#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004771 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004772 Py_DECREF(unicode);
4773 return NULL;
4774 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004775#endif
Walter Dörwald41980ca2007-08-16 21:55:45 +00004776 return (PyObject *)unicode;
4777
Benjamin Peterson29060642009-01-31 22:14:21 +00004778 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004779 Py_DECREF(unicode);
4780 Py_XDECREF(errorHandler);
4781 Py_XDECREF(exc);
4782 return NULL;
4783}
4784
4785PyObject *
4786PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004787 Py_ssize_t size,
4788 const char *errors,
4789 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004790{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004791 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004792 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004793 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004794#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004795 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004796#else
4797 const int pairs = 0;
4798#endif
4799 /* Offsets from p for storing byte pairs in the right order. */
4800#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4801 int iorder[] = {0, 1, 2, 3};
4802#else
4803 int iorder[] = {3, 2, 1, 0};
4804#endif
4805
Benjamin Peterson29060642009-01-31 22:14:21 +00004806#define STORECHAR(CH) \
4807 do { \
4808 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4809 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4810 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4811 p[iorder[0]] = (CH) & 0xff; \
4812 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004813 } while(0)
4814
4815 /* In narrow builds we can output surrogate pairs as one codepoint,
4816 so we need less space. */
4817#ifndef Py_UNICODE_WIDE
4818 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004819 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4820 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4821 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004822#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004823 nsize = (size - pairs + (byteorder == 0));
4824 bytesize = nsize * 4;
4825 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004826 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004827 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004828 if (v == NULL)
4829 return NULL;
4830
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004831 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004832 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004833 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004834 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004835 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004836
4837 if (byteorder == -1) {
4838 /* force LE */
4839 iorder[0] = 0;
4840 iorder[1] = 1;
4841 iorder[2] = 2;
4842 iorder[3] = 3;
4843 }
4844 else if (byteorder == 1) {
4845 /* force BE */
4846 iorder[0] = 3;
4847 iorder[1] = 2;
4848 iorder[2] = 1;
4849 iorder[3] = 0;
4850 }
4851
4852 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004853 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004854#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004855 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4856 Py_UCS4 ch2 = *s;
4857 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4858 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4859 s++;
4860 size--;
4861 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004862 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004863#endif
4864 STORECHAR(ch);
4865 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004866
4867 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004868 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004869#undef STORECHAR
4870}
4871
Alexander Belopolsky40018472011-02-26 01:02:56 +00004872PyObject *
4873PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004874{
4875 if (!PyUnicode_Check(unicode)) {
4876 PyErr_BadArgument();
4877 return NULL;
4878 }
4879 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004880 PyUnicode_GET_SIZE(unicode),
4881 NULL,
4882 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004883}
4884
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885/* --- UTF-16 Codec ------------------------------------------------------- */
4886
Tim Peters772747b2001-08-09 22:21:55 +00004887PyObject *
4888PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004889 Py_ssize_t size,
4890 const char *errors,
4891 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892{
Walter Dörwald69652032004-09-07 20:24:22 +00004893 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4894}
4895
Antoine Pitrouab868312009-01-10 15:40:25 +00004896/* Two masks for fast checking of whether a C 'long' may contain
4897 UTF16-encoded surrogate characters. This is an efficient heuristic,
4898 assuming that non-surrogate characters with a code point >= 0x8000 are
4899 rare in most input.
4900 FAST_CHAR_MASK is used when the input is in native byte ordering,
4901 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004902*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004903#if (SIZEOF_LONG == 8)
4904# define FAST_CHAR_MASK 0x8000800080008000L
4905# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4906#elif (SIZEOF_LONG == 4)
4907# define FAST_CHAR_MASK 0x80008000L
4908# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4909#else
4910# error C 'long' size should be either 4 or 8!
4911#endif
4912
Walter Dörwald69652032004-09-07 20:24:22 +00004913PyObject *
4914PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004915 Py_ssize_t size,
4916 const char *errors,
4917 int *byteorder,
4918 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004919{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004920 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004921 Py_ssize_t startinpos;
4922 Py_ssize_t endinpos;
4923 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924 PyUnicodeObject *unicode;
4925 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004926 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004927 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004928 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004929 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004930 /* Offsets from q for retrieving byte pairs in the right order. */
4931#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4932 int ihi = 1, ilo = 0;
4933#else
4934 int ihi = 0, ilo = 1;
4935#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004936 PyObject *errorHandler = NULL;
4937 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938
4939 /* Note: size will always be longer than the resulting Unicode
4940 character count */
4941 unicode = _PyUnicode_New(size);
4942 if (!unicode)
4943 return NULL;
4944 if (size == 0)
4945 return (PyObject *)unicode;
4946
4947 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004948 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004949 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004950 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004951
4952 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004953 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004954
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004955 /* Check for BOM marks (U+FEFF) in the input and adjust current
4956 byte order setting accordingly. In native mode, the leading BOM
4957 mark is skipped, in all other modes, it is copied to the output
4958 stream as-is (giving a ZWNBSP character). */
4959 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004960 if (size >= 2) {
4961 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004962#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 if (bom == 0xFEFF) {
4964 q += 2;
4965 bo = -1;
4966 }
4967 else if (bom == 0xFFFE) {
4968 q += 2;
4969 bo = 1;
4970 }
Tim Petersced69f82003-09-16 20:30:58 +00004971#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004972 if (bom == 0xFEFF) {
4973 q += 2;
4974 bo = 1;
4975 }
4976 else if (bom == 0xFFFE) {
4977 q += 2;
4978 bo = -1;
4979 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004980#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004981 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004983
Tim Peters772747b2001-08-09 22:21:55 +00004984 if (bo == -1) {
4985 /* force LE */
4986 ihi = 1;
4987 ilo = 0;
4988 }
4989 else if (bo == 1) {
4990 /* force BE */
4991 ihi = 0;
4992 ilo = 1;
4993 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004994#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4995 native_ordering = ilo < ihi;
4996#else
4997 native_ordering = ilo > ihi;
4998#endif
Tim Peters772747b2001-08-09 22:21:55 +00004999
Antoine Pitrouab868312009-01-10 15:40:25 +00005000 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005001 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005002 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005003 /* First check for possible aligned read of a C 'long'. Unaligned
5004 reads are more expensive, better to defer to another iteration. */
5005 if (!((size_t) q & LONG_PTR_MASK)) {
5006 /* Fast path for runs of non-surrogate chars. */
5007 register const unsigned char *_q = q;
5008 Py_UNICODE *_p = p;
5009 if (native_ordering) {
5010 /* Native ordering is simple: as long as the input cannot
5011 possibly contain a surrogate char, do an unrolled copy
5012 of several 16-bit code points to the target object.
5013 The non-surrogate check is done on several input bytes
5014 at a time (as many as a C 'long' can contain). */
5015 while (_q < aligned_end) {
5016 unsigned long data = * (unsigned long *) _q;
5017 if (data & FAST_CHAR_MASK)
5018 break;
5019 _p[0] = ((unsigned short *) _q)[0];
5020 _p[1] = ((unsigned short *) _q)[1];
5021#if (SIZEOF_LONG == 8)
5022 _p[2] = ((unsigned short *) _q)[2];
5023 _p[3] = ((unsigned short *) _q)[3];
5024#endif
5025 _q += SIZEOF_LONG;
5026 _p += SIZEOF_LONG / 2;
5027 }
5028 }
5029 else {
5030 /* Byteswapped ordering is similar, but we must decompose
5031 the copy bytewise, and take care of zero'ing out the
5032 upper bytes if the target object is in 32-bit units
5033 (that is, in UCS-4 builds). */
5034 while (_q < aligned_end) {
5035 unsigned long data = * (unsigned long *) _q;
5036 if (data & SWAPPED_FAST_CHAR_MASK)
5037 break;
5038 /* Zero upper bytes in UCS-4 builds */
5039#if (Py_UNICODE_SIZE > 2)
5040 _p[0] = 0;
5041 _p[1] = 0;
5042#if (SIZEOF_LONG == 8)
5043 _p[2] = 0;
5044 _p[3] = 0;
5045#endif
5046#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005047 /* Issue #4916; UCS-4 builds on big endian machines must
5048 fill the two last bytes of each 4-byte unit. */
5049#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5050# define OFF 2
5051#else
5052# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005053#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005054 ((unsigned char *) _p)[OFF + 1] = _q[0];
5055 ((unsigned char *) _p)[OFF + 0] = _q[1];
5056 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5057 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5058#if (SIZEOF_LONG == 8)
5059 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5060 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5061 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5062 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5063#endif
5064#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005065 _q += SIZEOF_LONG;
5066 _p += SIZEOF_LONG / 2;
5067 }
5068 }
5069 p = _p;
5070 q = _q;
5071 if (q >= e)
5072 break;
5073 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005074 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005075
Benjamin Peterson14339b62009-01-31 16:36:08 +00005076 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005077
5078 if (ch < 0xD800 || ch > 0xDFFF) {
5079 *p++ = ch;
5080 continue;
5081 }
5082
5083 /* UTF-16 code pair: */
5084 if (q > e) {
5085 errmsg = "unexpected end of data";
5086 startinpos = (((const char *)q) - 2) - starts;
5087 endinpos = ((const char *)e) + 1 - starts;
5088 goto utf16Error;
5089 }
5090 if (0xD800 <= ch && ch <= 0xDBFF) {
5091 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5092 q += 2;
5093 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005094#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005095 *p++ = ch;
5096 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005097#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005098 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005099#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005100 continue;
5101 }
5102 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005103 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005104 startinpos = (((const char *)q)-4)-starts;
5105 endinpos = startinpos+2;
5106 goto utf16Error;
5107 }
5108
Benjamin Peterson14339b62009-01-31 16:36:08 +00005109 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005110 errmsg = "illegal encoding";
5111 startinpos = (((const char *)q)-2)-starts;
5112 endinpos = startinpos+2;
5113 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005114
Benjamin Peterson29060642009-01-31 22:14:21 +00005115 utf16Error:
5116 outpos = p - PyUnicode_AS_UNICODE(unicode);
5117 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005118 errors,
5119 &errorHandler,
5120 "utf16", errmsg,
5121 &starts,
5122 (const char **)&e,
5123 &startinpos,
5124 &endinpos,
5125 &exc,
5126 (const char **)&q,
5127 &unicode,
5128 &outpos,
5129 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005130 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005132 /* remaining byte at the end? (size should be even) */
5133 if (e == q) {
5134 if (!consumed) {
5135 errmsg = "truncated data";
5136 startinpos = ((const char *)q) - starts;
5137 endinpos = ((const char *)e) + 1 - starts;
5138 outpos = p - PyUnicode_AS_UNICODE(unicode);
5139 if (unicode_decode_call_errorhandler(
5140 errors,
5141 &errorHandler,
5142 "utf16", errmsg,
5143 &starts,
5144 (const char **)&e,
5145 &startinpos,
5146 &endinpos,
5147 &exc,
5148 (const char **)&q,
5149 &unicode,
5150 &outpos,
5151 &p))
5152 goto onError;
5153 /* The remaining input chars are ignored if the callback
5154 chooses to skip the input */
5155 }
5156 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157
5158 if (byteorder)
5159 *byteorder = bo;
5160
Walter Dörwald69652032004-09-07 20:24:22 +00005161 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005162 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005163
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005165 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166 goto onError;
5167
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005168 Py_XDECREF(errorHandler);
5169 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005170#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005171 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005172 Py_DECREF(unicode);
5173 return NULL;
5174 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005175#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176 return (PyObject *)unicode;
5177
Benjamin Peterson29060642009-01-31 22:14:21 +00005178 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005180 Py_XDECREF(errorHandler);
5181 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182 return NULL;
5183}
5184
Antoine Pitrouab868312009-01-10 15:40:25 +00005185#undef FAST_CHAR_MASK
5186#undef SWAPPED_FAST_CHAR_MASK
5187
Tim Peters772747b2001-08-09 22:21:55 +00005188PyObject *
5189PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005190 Py_ssize_t size,
5191 const char *errors,
5192 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005194 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005195 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005196 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005197#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005198 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005199#else
5200 const int pairs = 0;
5201#endif
Tim Peters772747b2001-08-09 22:21:55 +00005202 /* Offsets from p for storing byte pairs in the right order. */
5203#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5204 int ihi = 1, ilo = 0;
5205#else
5206 int ihi = 0, ilo = 1;
5207#endif
5208
Benjamin Peterson29060642009-01-31 22:14:21 +00005209#define STORECHAR(CH) \
5210 do { \
5211 p[ihi] = ((CH) >> 8) & 0xff; \
5212 p[ilo] = (CH) & 0xff; \
5213 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005214 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005216#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005217 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005218 if (s[i] >= 0x10000)
5219 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005220#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005221 /* 2 * (size + pairs + (byteorder == 0)) */
5222 if (size > PY_SSIZE_T_MAX ||
5223 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005224 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005225 nsize = size + pairs + (byteorder == 0);
5226 bytesize = nsize * 2;
5227 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005229 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230 if (v == NULL)
5231 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005233 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005235 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005236 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005237 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005238
5239 if (byteorder == -1) {
5240 /* force LE */
5241 ihi = 1;
5242 ilo = 0;
5243 }
5244 else if (byteorder == 1) {
5245 /* force BE */
5246 ihi = 0;
5247 ilo = 1;
5248 }
5249
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005250 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005251 Py_UNICODE ch = *s++;
5252 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005253#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005254 if (ch >= 0x10000) {
5255 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5256 ch = 0xD800 | ((ch-0x10000) >> 10);
5257 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005258#endif
Tim Peters772747b2001-08-09 22:21:55 +00005259 STORECHAR(ch);
5260 if (ch2)
5261 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005262 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005263
5264 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005265 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005266#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267}
5268
Alexander Belopolsky40018472011-02-26 01:02:56 +00005269PyObject *
5270PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271{
5272 if (!PyUnicode_Check(unicode)) {
5273 PyErr_BadArgument();
5274 return NULL;
5275 }
5276 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005277 PyUnicode_GET_SIZE(unicode),
5278 NULL,
5279 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280}
5281
5282/* --- Unicode Escape Codec ----------------------------------------------- */
5283
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the string is still a valid ASCII string once its escapes are decoded.

   Returns the length of the buffer required to hold the decoded string
   when the input consists only of ASCII bytes and of escapes whose result
   is known to be ASCII: the single-character escapes for backslash, single
   quote, double quote, b, f, t, n, r, v and a count as one character, a
   backslash followed by a newline counts as zero characters, and an
   unrecognised escape is counted as two characters (the backslash plus the
   character that follows it).

   Returns -1 when size is negative, when a non-ASCII byte is found, when
   the input ends directly after a backslash, or when an octal, x, u, U or
   N escape is found, because those do not guarantee ASCII characters. */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before using it in pointer arithmetic. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
            }
        }
    }
    return length;
}
5338
/* Similar to PyUnicode_WRITE, but the target is either a wstr (Py_UNICODE)
   buffer, when kind is PyUnicode_WCHAR_KIND, or a buffer of single bytes
   (the string is treated as ASCII) for every other kind. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) != PyUnicode_WCHAR_KIND)                             \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else                                                            \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
    } while (0)

/* Store value at buf[index], treating buf as a wstr (Py_UNICODE) buffer.
   Relies on a variable named `kind` being in scope where the macro is
   expanded and asserts that it equals PyUnicode_WCHAR_KIND.  Written as a
   do/while(0) statement, like WRITE_ASCII_OR_WSTR, so that the expansion
   is one self-contained statement instead of a bare comma expression. */
#define WRITE_WSTR(buf, index, value)                           \
    do {                                                        \
        assert(kind == PyUnicode_WCHAR_KIND);                   \
        ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);   \
    } while (0)
5352
5353
Fredrik Lundh06d12682001-01-24 07:59:11 +00005354static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005355
Alexander Belopolsky40018472011-02-26 01:02:56 +00005356PyObject *
5357PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005358 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005359 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005361 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005362 Py_ssize_t startinpos;
5363 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005364 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005366 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005368 char* message;
5369 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005370 PyObject *errorHandler = NULL;
5371 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005372 Py_ssize_t ascii_length;
5373 Py_ssize_t i;
5374 int kind;
5375 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005377 ascii_length = length_of_escaped_ascii_string(s, size);
5378
5379 /* After length_of_escaped_ascii_string() there are two alternatives,
5380 either the string is pure ASCII with named escapes like \n, etc.
5381 and we determined it's exact size (common case)
5382 or it contains \x, \u, ... escape sequences. then we create a
5383 legacy wchar string and resize it at the end of this function. */
5384 if (ascii_length >= 0) {
5385 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5386 if (!v)
5387 goto onError;
5388 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5389 kind = PyUnicode_1BYTE_KIND;
5390 data = PyUnicode_DATA(v);
5391 }
5392 else {
5393 /* Escaped strings will always be longer than the resulting
5394 Unicode string, so we start with size here and then reduce the
5395 length after conversion to the true value.
5396 (but if the error callback returns a long replacement string
5397 we'll have to allocate more space) */
5398 v = _PyUnicode_New(size);
5399 if (!v)
5400 goto onError;
5401 kind = PyUnicode_WCHAR_KIND;
5402 data = PyUnicode_AS_UNICODE(v);
5403 }
5404
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 if (size == 0)
5406 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005407 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005409
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 while (s < end) {
5411 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005412 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005413 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005415 if (kind == PyUnicode_WCHAR_KIND) {
5416 assert(i < _PyUnicode_WSTR_LENGTH(v));
5417 }
5418 else {
5419 /* The only case in which i == ascii_length is a backslash
5420 followed by a newline. */
5421 assert(i <= ascii_length);
5422 }
5423
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424 /* Non-escape characters are interpreted as Unicode ordinals */
5425 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005426 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 continue;
5428 }
5429
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005430 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431 /* \ - Escapes */
5432 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005433 c = *s++;
5434 if (s > end)
5435 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005436
5437 if (kind == PyUnicode_WCHAR_KIND) {
5438 assert(i < _PyUnicode_WSTR_LENGTH(v));
5439 }
5440 else {
5441 /* The only case in which i == ascii_length is a backslash
5442 followed by a newline. */
5443 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5444 }
5445
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005446 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447
Benjamin Peterson29060642009-01-31 22:14:21 +00005448 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005450 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5451 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5452 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5453 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5454 /* FF */
5455 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5456 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5457 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5458 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5459 /* VT */
5460 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5461 /* BEL, not classic C */
5462 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463
Benjamin Peterson29060642009-01-31 22:14:21 +00005464 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465 case '0': case '1': case '2': case '3':
5466 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005467 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005468 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005469 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005470 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005471 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005473 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 break;
5475
Benjamin Peterson29060642009-01-31 22:14:21 +00005476 /* hex escapes */
5477 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005479 digits = 2;
5480 message = "truncated \\xXX escape";
5481 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482
Benjamin Peterson29060642009-01-31 22:14:21 +00005483 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005485 digits = 4;
5486 message = "truncated \\uXXXX escape";
5487 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488
Benjamin Peterson29060642009-01-31 22:14:21 +00005489 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005490 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005491 digits = 8;
5492 message = "truncated \\UXXXXXXXX escape";
5493 hexescape:
5494 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005495 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005496 if (s+digits>end) {
5497 endinpos = size;
5498 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005499 errors, &errorHandler,
5500 "unicodeescape", "end of string in escape sequence",
5501 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005502 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005503 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005504 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005505 goto nextByte;
5506 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005507 for (j = 0; j < digits; ++j) {
5508 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005509 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005510 endinpos = (s+j+1)-starts;
5511 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005512 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 errors, &errorHandler,
5514 "unicodeescape", message,
5515 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005516 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005517 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005518 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005519 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005520 }
5521 chr = (chr<<4) & ~0xF;
5522 if (c >= '0' && c <= '9')
5523 chr += c - '0';
5524 else if (c >= 'a' && c <= 'f')
5525 chr += 10 + c - 'a';
5526 else
5527 chr += 10 + c - 'A';
5528 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005529 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005530 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005531 /* _decoding_error will have already written into the
5532 target buffer. */
5533 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005534 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005535 /* when we get here, chr is a 32-bit unicode character */
5536 if (chr <= 0xffff)
5537 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005538 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005539 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005540 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005541 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005542#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005543 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005544#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005545 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005546 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5547 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005548#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005549 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005550 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005551 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005552 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005553 errors, &errorHandler,
5554 "unicodeescape", "illegal Unicode character",
5555 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005556 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005557 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005558 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005559 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005560 break;
5561
Benjamin Peterson29060642009-01-31 22:14:21 +00005562 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005563 case 'N':
5564 message = "malformed \\N character escape";
5565 if (ucnhash_CAPI == NULL) {
5566 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005567 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5568 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005569 if (ucnhash_CAPI == NULL)
5570 goto ucnhashError;
5571 }
5572 if (*s == '{') {
5573 const char *start = s+1;
5574 /* look for the closing brace */
5575 while (*s != '}' && s < end)
5576 s++;
5577 if (s > start && s < end && *s == '}') {
5578 /* found a name. look it up in the unicode database */
5579 message = "unknown Unicode character name";
5580 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005581 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5582 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005583 goto store;
5584 }
5585 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005586 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005587 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005588 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 errors, &errorHandler,
5590 "unicodeescape", message,
5591 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005592 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005593 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005594 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005595 break;
5596
5597 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005598 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005599 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005600 message = "\\ at end of string";
5601 s--;
5602 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005603 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005604 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005605 errors, &errorHandler,
5606 "unicodeescape", message,
5607 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005608 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005609 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005610 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005611 }
5612 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005613 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5614 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005615 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005616 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005618 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005619 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005621 /* Ensure the length prediction worked in case of ASCII strings */
5622 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5623
Victor Stinnerfe226c02011-10-03 03:52:20 +02005624 if (kind == PyUnicode_WCHAR_KIND)
5625 {
5626 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5627 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005628 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005629 Py_XDECREF(errorHandler);
5630 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005631#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005632 if (_PyUnicode_READY_REPLACE(&v)) {
5633 Py_DECREF(v);
5634 return NULL;
5635 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005636#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005638
Benjamin Peterson29060642009-01-31 22:14:21 +00005639 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005640 PyErr_SetString(
5641 PyExc_UnicodeError,
5642 "\\N escapes not supported (can't load unicodedata module)"
5643 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005644 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005645 Py_XDECREF(errorHandler);
5646 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005647 return NULL;
5648
Benjamin Peterson29060642009-01-31 22:14:21 +00005649 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005651 Py_XDECREF(errorHandler);
5652 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653 return NULL;
5654}
5655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005656#undef WRITE_ASCII_OR_WSTR
5657#undef WRITE_WSTR
5658
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659/* Return a Unicode-Escape string version of the Unicode object.
5660
5661 If quotes is true, the string is enclosed in u"" or u'' quotes as
5662 appropriate.
5663
5664*/
5665
Walter Dörwald79e913e2007-05-12 11:08:06 +00005666static const char *hexdigits = "0123456789abcdef";
5667
Alexander Belopolsky40018472011-02-26 01:02:56 +00005668PyObject *
5669PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005670 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005672 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005675#ifdef Py_UNICODE_WIDE
5676 const Py_ssize_t expandsize = 10;
5677#else
5678 const Py_ssize_t expandsize = 6;
5679#endif
5680
Thomas Wouters89f507f2006-12-13 04:49:30 +00005681 /* XXX(nnorwitz): rather than over-allocating, it would be
5682 better to choose a different scheme. Perhaps scan the
5683 first N-chars of the string and allocate based on that size.
5684 */
5685 /* Initial allocation is based on the longest-possible unichr
5686 escape.
5687
5688 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5689 unichr, so in this case it's the longest unichr escape. In
5690 narrow (UTF-16) builds this is five chars per source unichr
5691 since there are two unichrs in the surrogate pair, so in narrow
5692 (UTF-16) builds it's not the longest unichr escape.
5693
5694 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5695 so in the narrow (UTF-16) build case it's the longest unichr
5696 escape.
5697 */
5698
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005699 if (size == 0)
5700 return PyBytes_FromStringAndSize(NULL, 0);
5701
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005702 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005704
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005705 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005706 2
5707 + expandsize*size
5708 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 if (repr == NULL)
5710 return NULL;
5711
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005712 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 while (size-- > 0) {
5715 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005716
Walter Dörwald79e913e2007-05-12 11:08:06 +00005717 /* Escape backslashes */
5718 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 *p++ = '\\';
5720 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005721 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005722 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005723
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005724#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005725 /* Map 21-bit characters to '\U00xxxxxx' */
5726 else if (ch >= 0x10000) {
5727 *p++ = '\\';
5728 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005729 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5730 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5731 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5732 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5733 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5734 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5735 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5736 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005737 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005738 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005739#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5741 else if (ch >= 0xD800 && ch < 0xDC00) {
5742 Py_UNICODE ch2;
5743 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005744
Benjamin Peterson29060642009-01-31 22:14:21 +00005745 ch2 = *s++;
5746 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005747 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005748 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5749 *p++ = '\\';
5750 *p++ = 'U';
5751 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5752 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5753 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5754 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5755 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5756 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5757 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5758 *p++ = hexdigits[ucs & 0x0000000F];
5759 continue;
5760 }
5761 /* Fall through: isolated surrogates are copied as-is */
5762 s--;
5763 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005764 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005765#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005766
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005768 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 *p++ = '\\';
5770 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005771 *p++ = hexdigits[(ch >> 12) & 0x000F];
5772 *p++ = hexdigits[(ch >> 8) & 0x000F];
5773 *p++ = hexdigits[(ch >> 4) & 0x000F];
5774 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005776
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005777 /* Map special whitespace to '\t', \n', '\r' */
5778 else if (ch == '\t') {
5779 *p++ = '\\';
5780 *p++ = 't';
5781 }
5782 else if (ch == '\n') {
5783 *p++ = '\\';
5784 *p++ = 'n';
5785 }
5786 else if (ch == '\r') {
5787 *p++ = '\\';
5788 *p++ = 'r';
5789 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005790
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005791 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005792 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005794 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005795 *p++ = hexdigits[(ch >> 4) & 0x000F];
5796 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005797 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005798
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799 /* Copy everything else as-is */
5800 else
5801 *p++ = (char) ch;
5802 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005804 assert(p - PyBytes_AS_STRING(repr) > 0);
5805 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5806 return NULL;
5807 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808}
5809
Alexander Belopolsky40018472011-02-26 01:02:56 +00005810PyObject *
5811PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005813 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814 if (!PyUnicode_Check(unicode)) {
5815 PyErr_BadArgument();
5816 return NULL;
5817 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005818 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5819 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005820 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821}
5822
5823/* --- Raw Unicode Escape Codec ------------------------------------------- */
5824
Alexander Belopolsky40018472011-02-26 01:02:56 +00005825PyObject *
5826PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005827 Py_ssize_t size,
5828 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005830 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005831 Py_ssize_t startinpos;
5832 Py_ssize_t endinpos;
5833 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005835 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 const char *end;
5837 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005838 PyObject *errorHandler = NULL;
5839 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005840
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841 /* Escaped strings will always be longer than the resulting
5842 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005843 length after conversion to the true value. (But decoding error
5844 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845 v = _PyUnicode_New(size);
5846 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005849 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005850 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 end = s + size;
5852 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005853 unsigned char c;
5854 Py_UCS4 x;
5855 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005856 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857
Benjamin Peterson29060642009-01-31 22:14:21 +00005858 /* Non-escape characters are interpreted as Unicode ordinals */
5859 if (*s != '\\') {
5860 *p++ = (unsigned char)*s++;
5861 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005862 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005863 startinpos = s-starts;
5864
5865 /* \u-escapes are only interpreted iff the number of leading
5866 backslashes if odd */
5867 bs = s;
5868 for (;s < end;) {
5869 if (*s != '\\')
5870 break;
5871 *p++ = (unsigned char)*s++;
5872 }
5873 if (((s - bs) & 1) == 0 ||
5874 s >= end ||
5875 (*s != 'u' && *s != 'U')) {
5876 continue;
5877 }
5878 p--;
5879 count = *s=='u' ? 4 : 8;
5880 s++;
5881
5882 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5883 outpos = p-PyUnicode_AS_UNICODE(v);
5884 for (x = 0, i = 0; i < count; ++i, ++s) {
5885 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005886 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005887 endinpos = s-starts;
5888 if (unicode_decode_call_errorhandler(
5889 errors, &errorHandler,
5890 "rawunicodeescape", "truncated \\uXXXX",
5891 &starts, &end, &startinpos, &endinpos, &exc, &s,
5892 &v, &outpos, &p))
5893 goto onError;
5894 goto nextByte;
5895 }
5896 x = (x<<4) & ~0xF;
5897 if (c >= '0' && c <= '9')
5898 x += c - '0';
5899 else if (c >= 'a' && c <= 'f')
5900 x += 10 + c - 'a';
5901 else
5902 x += 10 + c - 'A';
5903 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005904 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005905 /* UCS-2 character */
5906 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005907 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 /* UCS-4 character. Either store directly, or as
5909 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005910#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005911 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005912#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 x -= 0x10000L;
5914 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5915 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005916#endif
5917 } else {
5918 endinpos = s-starts;
5919 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005920 if (unicode_decode_call_errorhandler(
5921 errors, &errorHandler,
5922 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005923 &starts, &end, &startinpos, &endinpos, &exc, &s,
5924 &v, &outpos, &p))
5925 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005926 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 nextByte:
5928 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005930 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005931 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005932 Py_XDECREF(errorHandler);
5933 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005934#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005935 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005936 Py_DECREF(v);
5937 return NULL;
5938 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005939#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005941
Benjamin Peterson29060642009-01-31 22:14:21 +00005942 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005944 Py_XDECREF(errorHandler);
5945 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 return NULL;
5947}
5948
Alexander Belopolsky40018472011-02-26 01:02:56 +00005949PyObject *
5950PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005951 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005953 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 char *p;
5955 char *q;
5956
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005957#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005958 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005959#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005960 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005961#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005962
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005963 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005964 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005965
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005966 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 if (repr == NULL)
5968 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005969 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005970 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005972 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 while (size-- > 0) {
5974 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005975#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 /* Map 32-bit characters to '\Uxxxxxxxx' */
5977 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005978 *p++ = '\\';
5979 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005980 *p++ = hexdigits[(ch >> 28) & 0xf];
5981 *p++ = hexdigits[(ch >> 24) & 0xf];
5982 *p++ = hexdigits[(ch >> 20) & 0xf];
5983 *p++ = hexdigits[(ch >> 16) & 0xf];
5984 *p++ = hexdigits[(ch >> 12) & 0xf];
5985 *p++ = hexdigits[(ch >> 8) & 0xf];
5986 *p++ = hexdigits[(ch >> 4) & 0xf];
5987 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005988 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005989 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005990#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5992 if (ch >= 0xD800 && ch < 0xDC00) {
5993 Py_UNICODE ch2;
5994 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005995
Benjamin Peterson29060642009-01-31 22:14:21 +00005996 ch2 = *s++;
5997 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005998 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6000 *p++ = '\\';
6001 *p++ = 'U';
6002 *p++ = hexdigits[(ucs >> 28) & 0xf];
6003 *p++ = hexdigits[(ucs >> 24) & 0xf];
6004 *p++ = hexdigits[(ucs >> 20) & 0xf];
6005 *p++ = hexdigits[(ucs >> 16) & 0xf];
6006 *p++ = hexdigits[(ucs >> 12) & 0xf];
6007 *p++ = hexdigits[(ucs >> 8) & 0xf];
6008 *p++ = hexdigits[(ucs >> 4) & 0xf];
6009 *p++ = hexdigits[ucs & 0xf];
6010 continue;
6011 }
6012 /* Fall through: isolated surrogates are copied as-is */
6013 s--;
6014 size++;
6015 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006016#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 /* Map 16-bit characters to '\uxxxx' */
6018 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019 *p++ = '\\';
6020 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006021 *p++ = hexdigits[(ch >> 12) & 0xf];
6022 *p++ = hexdigits[(ch >> 8) & 0xf];
6023 *p++ = hexdigits[(ch >> 4) & 0xf];
6024 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006026 /* Copy everything else as-is */
6027 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 *p++ = (char) ch;
6029 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006030 size = p - q;
6031
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006032 assert(size > 0);
6033 if (_PyBytes_Resize(&repr, size) < 0)
6034 return NULL;
6035 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036}
6037
Alexander Belopolsky40018472011-02-26 01:02:56 +00006038PyObject *
6039PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006041 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006043 PyErr_BadArgument();
6044 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006046 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6047 PyUnicode_GET_SIZE(unicode));
6048
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006049 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050}
6051
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006052/* --- Unicode Internal Codec ------------------------------------------- */
6053
Alexander Belopolsky40018472011-02-26 01:02:56 +00006054PyObject *
6055_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006056 Py_ssize_t size,
6057 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006058{
6059 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006060 Py_ssize_t startinpos;
6061 Py_ssize_t endinpos;
6062 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006063 PyUnicodeObject *v;
6064 Py_UNICODE *p;
6065 const char *end;
6066 const char *reason;
6067 PyObject *errorHandler = NULL;
6068 PyObject *exc = NULL;
6069
Neal Norwitzd43069c2006-01-08 01:12:10 +00006070#ifdef Py_UNICODE_WIDE
6071 Py_UNICODE unimax = PyUnicode_GetMax();
6072#endif
6073
Thomas Wouters89f507f2006-12-13 04:49:30 +00006074 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006075 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6076 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006078 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6079 as string was created with the old API. */
6080 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006082 p = PyUnicode_AS_UNICODE(v);
6083 end = s + size;
6084
6085 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006086 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006087 /* We have to sanity check the raw data, otherwise doom looms for
6088 some malformed UCS-4 data. */
6089 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006090#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006091 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006092#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006093 end-s < Py_UNICODE_SIZE
6094 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006095 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006096 startinpos = s - starts;
6097 if (end-s < Py_UNICODE_SIZE) {
6098 endinpos = end-starts;
6099 reason = "truncated input";
6100 }
6101 else {
6102 endinpos = s - starts + Py_UNICODE_SIZE;
6103 reason = "illegal code point (> 0x10FFFF)";
6104 }
6105 outpos = p - PyUnicode_AS_UNICODE(v);
6106 if (unicode_decode_call_errorhandler(
6107 errors, &errorHandler,
6108 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006109 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006110 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006111 goto onError;
6112 }
6113 }
6114 else {
6115 p++;
6116 s += Py_UNICODE_SIZE;
6117 }
6118 }
6119
Victor Stinnerfe226c02011-10-03 03:52:20 +02006120 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006121 goto onError;
6122 Py_XDECREF(errorHandler);
6123 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006124#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006125 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006126 Py_DECREF(v);
6127 return NULL;
6128 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006129#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006130 return (PyObject *)v;
6131
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006133 Py_XDECREF(v);
6134 Py_XDECREF(errorHandler);
6135 Py_XDECREF(exc);
6136 return NULL;
6137}
6138
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139/* --- Latin-1 Codec ------------------------------------------------------ */
6140
Alexander Belopolsky40018472011-02-26 01:02:56 +00006141PyObject *
6142PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006143 Py_ssize_t size,
6144 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006147 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148}
6149
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006150/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006151static void
6152make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006153 const char *encoding,
6154 const Py_UNICODE *unicode, Py_ssize_t size,
6155 Py_ssize_t startpos, Py_ssize_t endpos,
6156 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006158 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006159 *exceptionObject = PyUnicodeEncodeError_Create(
6160 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 }
6162 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006163 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6164 goto onError;
6165 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6166 goto onError;
6167 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6168 goto onError;
6169 return;
6170 onError:
6171 Py_DECREF(*exceptionObject);
6172 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173 }
6174}
6175
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006176/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006177static void
6178raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006179 const char *encoding,
6180 const Py_UNICODE *unicode, Py_ssize_t size,
6181 Py_ssize_t startpos, Py_ssize_t endpos,
6182 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006183{
6184 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006185 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006186 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006187 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006188}
6189
6190/* error handling callback helper:
6191 build arguments, call the callback and check the arguments,
6192 put the result into newpos and return the replacement string, which
6193 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006194static PyObject *
6195unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006196 PyObject **errorHandler,
6197 const char *encoding, const char *reason,
6198 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6199 Py_ssize_t startpos, Py_ssize_t endpos,
6200 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006201{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006202 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006203
6204 PyObject *restuple;
6205 PyObject *resunicode;
6206
6207 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006209 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006210 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006211 }
6212
6213 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006215 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006216 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006217
6218 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006219 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006220 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006222 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006223 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006224 Py_DECREF(restuple);
6225 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006226 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006227 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006228 &resunicode, newpos)) {
6229 Py_DECREF(restuple);
6230 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006231 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006232 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6233 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6234 Py_DECREF(restuple);
6235 return NULL;
6236 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006237 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006239 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6241 Py_DECREF(restuple);
6242 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006243 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006244 Py_INCREF(resunicode);
6245 Py_DECREF(restuple);
6246 return resunicode;
6247}
6248
Alexander Belopolsky40018472011-02-26 01:02:56 +00006249static PyObject *
6250unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006251 Py_ssize_t size,
6252 const char *errors,
6253 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006254{
6255 /* output object */
6256 PyObject *res;
6257 /* pointers to the beginning and end+1 of input */
6258 const Py_UNICODE *startp = p;
6259 const Py_UNICODE *endp = p + size;
6260 /* pointer to the beginning of the unencodable characters */
6261 /* const Py_UNICODE *badp = NULL; */
6262 /* pointer into the output */
6263 char *str;
6264 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006265 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006266 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6267 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006268 PyObject *errorHandler = NULL;
6269 PyObject *exc = NULL;
6270 /* the following variable is used for caching string comparisons
6271 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6272 int known_errorHandler = -1;
6273
6274 /* allocate enough for a simple encoding without
6275 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006276 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006277 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006278 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006279 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006280 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006281 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006282 ressize = size;
6283
6284 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006285 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006286
Benjamin Peterson29060642009-01-31 22:14:21 +00006287 /* can we encode this? */
6288 if (c<limit) {
6289 /* no overflow check, because we know that the space is enough */
6290 *str++ = (char)c;
6291 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006292 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006293 else {
6294 Py_ssize_t unicodepos = p-startp;
6295 Py_ssize_t requiredsize;
6296 PyObject *repunicode;
6297 Py_ssize_t repsize;
6298 Py_ssize_t newpos;
6299 Py_ssize_t respos;
6300 Py_UNICODE *uni2;
6301 /* startpos for collecting unencodable chars */
6302 const Py_UNICODE *collstart = p;
6303 const Py_UNICODE *collend = p;
6304 /* find all unecodable characters */
6305 while ((collend < endp) && ((*collend)>=limit))
6306 ++collend;
6307 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6308 if (known_errorHandler==-1) {
6309 if ((errors==NULL) || (!strcmp(errors, "strict")))
6310 known_errorHandler = 1;
6311 else if (!strcmp(errors, "replace"))
6312 known_errorHandler = 2;
6313 else if (!strcmp(errors, "ignore"))
6314 known_errorHandler = 3;
6315 else if (!strcmp(errors, "xmlcharrefreplace"))
6316 known_errorHandler = 4;
6317 else
6318 known_errorHandler = 0;
6319 }
6320 switch (known_errorHandler) {
6321 case 1: /* strict */
6322 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6323 goto onError;
6324 case 2: /* replace */
6325 while (collstart++<collend)
6326 *str++ = '?'; /* fall through */
6327 case 3: /* ignore */
6328 p = collend;
6329 break;
6330 case 4: /* xmlcharrefreplace */
6331 respos = str - PyBytes_AS_STRING(res);
6332 /* determine replacement size (temporarily (mis)uses p) */
6333 for (p = collstart, repsize = 0; p < collend; ++p) {
6334 if (*p<10)
6335 repsize += 2+1+1;
6336 else if (*p<100)
6337 repsize += 2+2+1;
6338 else if (*p<1000)
6339 repsize += 2+3+1;
6340 else if (*p<10000)
6341 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006342#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 else
6344 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006345#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 else if (*p<100000)
6347 repsize += 2+5+1;
6348 else if (*p<1000000)
6349 repsize += 2+6+1;
6350 else
6351 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006352#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 }
6354 requiredsize = respos+repsize+(endp-collend);
6355 if (requiredsize > ressize) {
6356 if (requiredsize<2*ressize)
6357 requiredsize = 2*ressize;
6358 if (_PyBytes_Resize(&res, requiredsize))
6359 goto onError;
6360 str = PyBytes_AS_STRING(res) + respos;
6361 ressize = requiredsize;
6362 }
6363 /* generate replacement (temporarily (mis)uses p) */
6364 for (p = collstart; p < collend; ++p) {
6365 str += sprintf(str, "&#%d;", (int)*p);
6366 }
6367 p = collend;
6368 break;
6369 default:
6370 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6371 encoding, reason, startp, size, &exc,
6372 collstart-startp, collend-startp, &newpos);
6373 if (repunicode == NULL)
6374 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006375 if (PyBytes_Check(repunicode)) {
6376 /* Directly copy bytes result to output. */
6377 repsize = PyBytes_Size(repunicode);
6378 if (repsize > 1) {
6379 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006380 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006381 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6382 Py_DECREF(repunicode);
6383 goto onError;
6384 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006385 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006386 ressize += repsize-1;
6387 }
6388 memcpy(str, PyBytes_AsString(repunicode), repsize);
6389 str += repsize;
6390 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006391 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006392 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006393 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006394 /* need more space? (at least enough for what we
6395 have+the replacement+the rest of the string, so
6396 we won't have to check space for encodable characters) */
6397 respos = str - PyBytes_AS_STRING(res);
6398 repsize = PyUnicode_GET_SIZE(repunicode);
6399 requiredsize = respos+repsize+(endp-collend);
6400 if (requiredsize > ressize) {
6401 if (requiredsize<2*ressize)
6402 requiredsize = 2*ressize;
6403 if (_PyBytes_Resize(&res, requiredsize)) {
6404 Py_DECREF(repunicode);
6405 goto onError;
6406 }
6407 str = PyBytes_AS_STRING(res) + respos;
6408 ressize = requiredsize;
6409 }
6410 /* check if there is anything unencodable in the replacement
6411 and copy it to the output */
6412 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6413 c = *uni2;
6414 if (c >= limit) {
6415 raise_encode_exception(&exc, encoding, startp, size,
6416 unicodepos, unicodepos+1, reason);
6417 Py_DECREF(repunicode);
6418 goto onError;
6419 }
6420 *str = (char)c;
6421 }
6422 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006423 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006424 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006425 }
6426 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006427 /* Resize if we allocated to much */
6428 size = str - PyBytes_AS_STRING(res);
6429 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006430 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006431 if (_PyBytes_Resize(&res, size) < 0)
6432 goto onError;
6433 }
6434
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006435 Py_XDECREF(errorHandler);
6436 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006437 return res;
6438
6439 onError:
6440 Py_XDECREF(res);
6441 Py_XDECREF(errorHandler);
6442 Py_XDECREF(exc);
6443 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006444}
6445
Alexander Belopolsky40018472011-02-26 01:02:56 +00006446PyObject *
6447PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006448 Py_ssize_t size,
6449 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006451 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452}
6453
Alexander Belopolsky40018472011-02-26 01:02:56 +00006454PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006455_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456{
6457 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006458 PyErr_BadArgument();
6459 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006461 if (PyUnicode_READY(unicode) == -1)
6462 return NULL;
6463 /* Fast path: if it is a one-byte string, construct
6464 bytes object directly. */
6465 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6466 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6467 PyUnicode_GET_LENGTH(unicode));
6468 /* Non-Latin-1 characters present. Defer to above function to
6469 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006471 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006472 errors);
6473}
6474
6475PyObject*
6476PyUnicode_AsLatin1String(PyObject *unicode)
6477{
6478 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006479}
6480
6481/* --- 7-bit ASCII Codec -------------------------------------------------- */
6482
Alexander Belopolsky40018472011-02-26 01:02:56 +00006483PyObject *
6484PyUnicode_DecodeASCII(const char *s,
6485 Py_ssize_t size,
6486 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006488 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006490 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006491 Py_ssize_t startinpos;
6492 Py_ssize_t endinpos;
6493 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006494 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006495 int has_error;
6496 const unsigned char *p = (const unsigned char *)s;
6497 const unsigned char *end = p + size;
6498 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006499 PyObject *errorHandler = NULL;
6500 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006501
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006503 if (size == 1 && (unsigned char)s[0] < 128)
6504 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006505
Victor Stinner702c7342011-10-05 13:50:52 +02006506 has_error = 0;
6507 while (p < end && !has_error) {
6508 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6509 an explanation. */
6510 if (!((size_t) p & LONG_PTR_MASK)) {
6511 /* Help register allocation */
6512 register const unsigned char *_p = p;
6513 while (_p < aligned_end) {
6514 unsigned long value = *(unsigned long *) _p;
6515 if (value & ASCII_CHAR_MASK) {
6516 has_error = 1;
6517 break;
6518 }
6519 _p += SIZEOF_LONG;
6520 }
6521 if (_p == end)
6522 break;
6523 if (has_error)
6524 break;
6525 p = _p;
6526 }
6527 if (*p & 0x80) {
6528 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006529 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006530 }
6531 else {
6532 ++p;
6533 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006534 }
Victor Stinner702c7342011-10-05 13:50:52 +02006535 if (!has_error)
6536 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006537
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538 v = _PyUnicode_New(size);
6539 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006543 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006544 e = s + size;
6545 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006546 register unsigned char c = (unsigned char)*s;
6547 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006548 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 ++s;
6550 }
6551 else {
6552 startinpos = s-starts;
6553 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006554 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006555 if (unicode_decode_call_errorhandler(
6556 errors, &errorHandler,
6557 "ascii", "ordinal not in range(128)",
6558 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006559 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006560 goto onError;
6561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 }
Victor Stinner702c7342011-10-05 13:50:52 +02006563 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6564 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006565 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006566 Py_XDECREF(errorHandler);
6567 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006568#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006569 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006570 Py_DECREF(v);
6571 return NULL;
6572 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006573#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006575
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006578 Py_XDECREF(errorHandler);
6579 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 return NULL;
6581}
6582
Alexander Belopolsky40018472011-02-26 01:02:56 +00006583PyObject *
6584PyUnicode_EncodeASCII(const Py_UNICODE *p,
6585 Py_ssize_t size,
6586 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006588 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589}
6590
Alexander Belopolsky40018472011-02-26 01:02:56 +00006591PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006592_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593{
6594 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006595 PyErr_BadArgument();
6596 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006598 if (PyUnicode_READY(unicode) == -1)
6599 return NULL;
6600 /* Fast path: if it is an ASCII-only string, construct bytes object
6601 directly. Else defer to above function to raise the exception. */
6602 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6603 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6604 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006606 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006607 errors);
6608}
6609
6610PyObject *
6611PyUnicode_AsASCIIString(PyObject *unicode)
6612{
6613 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614}
6615
Victor Stinner99b95382011-07-04 14:23:54 +02006616#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006617
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006618/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006619
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006620#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006621#define NEED_RETRY
6622#endif
6623
6624/* XXX This code is limited to "true" double-byte encodings, as
6625 a) it assumes an incomplete character consists of a single byte, and
6626 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006627 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006628
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.  The byte itself must satisfy IsDBCSLeadByte();
   the character preceding it (as located by CharPrev) is then used to
   confirm the verdict. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *pos = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*pos))
        return 0;

    before = CharPrev(s, pos);
    if (before == pos)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (pos - before == 2);
}
6640
6641/*
6642 * Decode MBCS string into unicode object. If 'final' is set, converts
6643 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6644 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006645static int
6646decode_mbcs(PyUnicodeObject **v,
6647 const char *s, /* MBCS string */
6648 int size, /* sizeof MBCS string */
6649 int final,
6650 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006651{
6652 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006653 Py_ssize_t n;
6654 DWORD usize;
6655 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006656
6657 assert(size >= 0);
6658
Victor Stinner554f3f02010-06-16 23:33:54 +00006659 /* check and handle 'errors' arg */
6660 if (errors==NULL || strcmp(errors, "strict")==0)
6661 flags = MB_ERR_INVALID_CHARS;
6662 else if (strcmp(errors, "ignore")==0)
6663 flags = 0;
6664 else {
6665 PyErr_Format(PyExc_ValueError,
6666 "mbcs encoding does not support errors='%s'",
6667 errors);
6668 return -1;
6669 }
6670
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006671 /* Skip trailing lead-byte unless 'final' is set */
6672 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006673 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006674
6675 /* First get the size of the result */
6676 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006677 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6678 if (usize==0)
6679 goto mbcs_decode_error;
6680 } else
6681 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006682
6683 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 /* Create unicode object */
6685 *v = _PyUnicode_New(usize);
6686 if (*v == NULL)
6687 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006688 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006689 }
6690 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006691 /* Extend unicode object */
6692 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006693 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006694 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006695 }
6696
6697 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006698 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006699 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006700 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6701 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006702 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006703 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006704 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006705
6706mbcs_decode_error:
6707 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6708 we raise a UnicodeDecodeError - else it is a 'generic'
6709 windows error
6710 */
6711 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6712 /* Ideally, we should get reason from FormatMessage - this
6713 is the Windows 2000 English version of the message
6714 */
6715 PyObject *exc = NULL;
6716 const char *reason = "No mapping for the Unicode character exists "
6717 "in the target multi-byte code page.";
6718 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6719 if (exc != NULL) {
6720 PyCodec_StrictErrors(exc);
6721 Py_DECREF(exc);
6722 }
6723 } else {
6724 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6725 }
6726 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006727}
6728
Alexander Belopolsky40018472011-02-26 01:02:56 +00006729PyObject *
6730PyUnicode_DecodeMBCSStateful(const char *s,
6731 Py_ssize_t size,
6732 const char *errors,
6733 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006734{
6735 PyUnicodeObject *v = NULL;
6736 int done;
6737
6738 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006740
6741#ifdef NEED_RETRY
6742 retry:
6743 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006744 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006745 else
6746#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006747 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006748
6749 if (done < 0) {
6750 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006752 }
6753
6754 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006756
6757#ifdef NEED_RETRY
6758 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 s += done;
6760 size -= done;
6761 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006762 }
6763#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02006764#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006765 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006766 Py_DECREF(v);
6767 return NULL;
6768 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006769#endif
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006770 return (PyObject *)v;
6771}
6772
Alexander Belopolsky40018472011-02-26 01:02:56 +00006773PyObject *
6774PyUnicode_DecodeMBCS(const char *s,
6775 Py_ssize_t size,
6776 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006777{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006778 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6779}
6780
6781/*
6782 * Convert unicode into string object (MBCS).
6783 * Returns 0 if succeed, -1 otherwise.
6784 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006785static int
6786encode_mbcs(PyObject **repr,
6787 const Py_UNICODE *p, /* unicode */
6788 int size, /* size of unicode */
6789 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006790{
Victor Stinner554f3f02010-06-16 23:33:54 +00006791 BOOL usedDefaultChar = FALSE;
6792 BOOL *pusedDefaultChar;
6793 int mbcssize;
6794 Py_ssize_t n;
6795 PyObject *exc = NULL;
6796 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006797
6798 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006799
Victor Stinner554f3f02010-06-16 23:33:54 +00006800 /* check and handle 'errors' arg */
6801 if (errors==NULL || strcmp(errors, "strict")==0) {
6802 flags = WC_NO_BEST_FIT_CHARS;
6803 pusedDefaultChar = &usedDefaultChar;
6804 } else if (strcmp(errors, "replace")==0) {
6805 flags = 0;
6806 pusedDefaultChar = NULL;
6807 } else {
6808 PyErr_Format(PyExc_ValueError,
6809 "mbcs encoding does not support errors='%s'",
6810 errors);
6811 return -1;
6812 }
6813
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006814 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006815 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006816 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6817 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006818 if (mbcssize == 0) {
6819 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6820 return -1;
6821 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006822 /* If we used a default char, then we failed! */
6823 if (pusedDefaultChar && *pusedDefaultChar)
6824 goto mbcs_encode_error;
6825 } else {
6826 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006827 }
6828
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006829 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 /* Create string object */
6831 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6832 if (*repr == NULL)
6833 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006834 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006835 }
6836 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006837 /* Extend string object */
6838 n = PyBytes_Size(*repr);
6839 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6840 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006841 }
6842
6843 /* Do the conversion */
6844 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006845 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006846 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6847 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006848 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6849 return -1;
6850 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006851 if (pusedDefaultChar && *pusedDefaultChar)
6852 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006853 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006854 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006855
6856mbcs_encode_error:
6857 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6858 Py_XDECREF(exc);
6859 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006860}
6861
Alexander Belopolsky40018472011-02-26 01:02:56 +00006862PyObject *
6863PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6864 Py_ssize_t size,
6865 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006866{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006867 PyObject *repr = NULL;
6868 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006869
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006870#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006871 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006872 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006873 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006874 else
6875#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006876 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006877
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006878 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 Py_XDECREF(repr);
6880 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006881 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006882
6883#ifdef NEED_RETRY
6884 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006885 p += INT_MAX;
6886 size -= INT_MAX;
6887 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006888 }
6889#endif
6890
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006891 return repr;
6892}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006893
Alexander Belopolsky40018472011-02-26 01:02:56 +00006894PyObject *
6895PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006896{
6897 if (!PyUnicode_Check(unicode)) {
6898 PyErr_BadArgument();
6899 return NULL;
6900 }
6901 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006902 PyUnicode_GET_SIZE(unicode),
6903 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006904}
6905
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006906#undef NEED_RETRY
6907
Victor Stinner99b95382011-07-04 14:23:54 +02006908#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006909
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910/* --- Character Mapping Codec -------------------------------------------- */
6911
Alexander Belopolsky40018472011-02-26 01:02:56 +00006912PyObject *
6913PyUnicode_DecodeCharmap(const char *s,
6914 Py_ssize_t size,
6915 PyObject *mapping,
6916 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006918 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006919 Py_ssize_t startinpos;
6920 Py_ssize_t endinpos;
6921 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006922 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923 PyUnicodeObject *v;
6924 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006925 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006926 PyObject *errorHandler = NULL;
6927 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006928 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006929 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006930
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931 /* Default to Latin-1 */
6932 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934
6935 v = _PyUnicode_New(size);
6936 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006937 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006939 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006941 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006942 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006943 mapstring = PyUnicode_AS_UNICODE(mapping);
6944 maplen = PyUnicode_GET_SIZE(mapping);
6945 while (s < e) {
6946 unsigned char ch = *s;
6947 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948
Benjamin Peterson29060642009-01-31 22:14:21 +00006949 if (ch < maplen)
6950 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951
Benjamin Peterson29060642009-01-31 22:14:21 +00006952 if (x == 0xfffe) {
6953 /* undefined mapping */
6954 outpos = p-PyUnicode_AS_UNICODE(v);
6955 startinpos = s-starts;
6956 endinpos = startinpos+1;
6957 if (unicode_decode_call_errorhandler(
6958 errors, &errorHandler,
6959 "charmap", "character maps to <undefined>",
6960 &starts, &e, &startinpos, &endinpos, &exc, &s,
6961 &v, &outpos, &p)) {
6962 goto onError;
6963 }
6964 continue;
6965 }
6966 *p++ = x;
6967 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006968 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006969 }
6970 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006971 while (s < e) {
6972 unsigned char ch = *s;
6973 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006974
Benjamin Peterson29060642009-01-31 22:14:21 +00006975 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6976 w = PyLong_FromLong((long)ch);
6977 if (w == NULL)
6978 goto onError;
6979 x = PyObject_GetItem(mapping, w);
6980 Py_DECREF(w);
6981 if (x == NULL) {
6982 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6983 /* No mapping found means: mapping is undefined. */
6984 PyErr_Clear();
6985 x = Py_None;
6986 Py_INCREF(x);
6987 } else
6988 goto onError;
6989 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006990
Benjamin Peterson29060642009-01-31 22:14:21 +00006991 /* Apply mapping */
6992 if (PyLong_Check(x)) {
6993 long value = PyLong_AS_LONG(x);
6994 if (value < 0 || value > 65535) {
6995 PyErr_SetString(PyExc_TypeError,
6996 "character mapping must be in range(65536)");
6997 Py_DECREF(x);
6998 goto onError;
6999 }
7000 *p++ = (Py_UNICODE)value;
7001 }
7002 else if (x == Py_None) {
7003 /* undefined mapping */
7004 outpos = p-PyUnicode_AS_UNICODE(v);
7005 startinpos = s-starts;
7006 endinpos = startinpos+1;
7007 if (unicode_decode_call_errorhandler(
7008 errors, &errorHandler,
7009 "charmap", "character maps to <undefined>",
7010 &starts, &e, &startinpos, &endinpos, &exc, &s,
7011 &v, &outpos, &p)) {
7012 Py_DECREF(x);
7013 goto onError;
7014 }
7015 Py_DECREF(x);
7016 continue;
7017 }
7018 else if (PyUnicode_Check(x)) {
7019 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007020
Benjamin Peterson29060642009-01-31 22:14:21 +00007021 if (targetsize == 1)
7022 /* 1-1 mapping */
7023 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007024
Benjamin Peterson29060642009-01-31 22:14:21 +00007025 else if (targetsize > 1) {
7026 /* 1-n mapping */
7027 if (targetsize > extrachars) {
7028 /* resize first */
7029 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7030 Py_ssize_t needed = (targetsize - extrachars) + \
7031 (targetsize << 2);
7032 extrachars += needed;
7033 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007034 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007035 PyUnicode_GET_SIZE(v) + needed) < 0) {
7036 Py_DECREF(x);
7037 goto onError;
7038 }
7039 p = PyUnicode_AS_UNICODE(v) + oldpos;
7040 }
7041 Py_UNICODE_COPY(p,
7042 PyUnicode_AS_UNICODE(x),
7043 targetsize);
7044 p += targetsize;
7045 extrachars -= targetsize;
7046 }
7047 /* 1-0 mapping: skip the character */
7048 }
7049 else {
7050 /* wrong return value */
7051 PyErr_SetString(PyExc_TypeError,
7052 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007053 Py_DECREF(x);
7054 goto onError;
7055 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007056 Py_DECREF(x);
7057 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059 }
7060 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007061 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007062 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007063 Py_XDECREF(errorHandler);
7064 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007065#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007066 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007067 Py_DECREF(v);
7068 return NULL;
7069 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007070#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007072
Benjamin Peterson29060642009-01-31 22:14:21 +00007073 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007074 Py_XDECREF(errorHandler);
7075 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076 Py_XDECREF(v);
7077 return NULL;
7078}
7079
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007080/* Charmap encoding: the lookup table */
7081
Alexander Belopolsky40018472011-02-26 01:02:56 +00007082struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007083 PyObject_HEAD
7084 unsigned char level1[32];
7085 int count2, count3;
7086 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007087};
7088
7089static PyObject*
7090encoding_map_size(PyObject *obj, PyObject* args)
7091{
7092 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007093 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007095}
7096
7097static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007098 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007099 PyDoc_STR("Return the size (in bytes) of this object") },
7100 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007101};
7102
7103static void
7104encoding_map_dealloc(PyObject* o)
7105{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007106 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007107}
7108
7109static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007110 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007111 "EncodingMap", /*tp_name*/
7112 sizeof(struct encoding_map), /*tp_basicsize*/
7113 0, /*tp_itemsize*/
7114 /* methods */
7115 encoding_map_dealloc, /*tp_dealloc*/
7116 0, /*tp_print*/
7117 0, /*tp_getattr*/
7118 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007119 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007120 0, /*tp_repr*/
7121 0, /*tp_as_number*/
7122 0, /*tp_as_sequence*/
7123 0, /*tp_as_mapping*/
7124 0, /*tp_hash*/
7125 0, /*tp_call*/
7126 0, /*tp_str*/
7127 0, /*tp_getattro*/
7128 0, /*tp_setattro*/
7129 0, /*tp_as_buffer*/
7130 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7131 0, /*tp_doc*/
7132 0, /*tp_traverse*/
7133 0, /*tp_clear*/
7134 0, /*tp_richcompare*/
7135 0, /*tp_weaklistoffset*/
7136 0, /*tp_iter*/
7137 0, /*tp_iternext*/
7138 encoding_map_methods, /*tp_methods*/
7139 0, /*tp_members*/
7140 0, /*tp_getset*/
7141 0, /*tp_base*/
7142 0, /*tp_dict*/
7143 0, /*tp_descr_get*/
7144 0, /*tp_descr_set*/
7145 0, /*tp_dictoffset*/
7146 0, /*tp_init*/
7147 0, /*tp_alloc*/
7148 0, /*tp_new*/
7149 0, /*tp_free*/
7150 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007151};
7152
7153PyObject*
7154PyUnicode_BuildEncodingMap(PyObject* string)
7155{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007156 PyObject *result;
7157 struct encoding_map *mresult;
7158 int i;
7159 int need_dict = 0;
7160 unsigned char level1[32];
7161 unsigned char level2[512];
7162 unsigned char *mlevel1, *mlevel2, *mlevel3;
7163 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007164 int kind;
7165 void *data;
7166 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007168 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007169 PyErr_BadArgument();
7170 return NULL;
7171 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007172 kind = PyUnicode_KIND(string);
7173 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007174 memset(level1, 0xFF, sizeof level1);
7175 memset(level2, 0xFF, sizeof level2);
7176
7177 /* If there isn't a one-to-one mapping of NULL to \0,
7178 or if there are non-BMP characters, we need to use
7179 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007180 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007181 need_dict = 1;
7182 for (i = 1; i < 256; i++) {
7183 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007184 ch = PyUnicode_READ(kind, data, i);
7185 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007186 need_dict = 1;
7187 break;
7188 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007189 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007190 /* unmapped character */
7191 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007192 l1 = ch >> 11;
7193 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007194 if (level1[l1] == 0xFF)
7195 level1[l1] = count2++;
7196 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007197 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007198 }
7199
7200 if (count2 >= 0xFF || count3 >= 0xFF)
7201 need_dict = 1;
7202
7203 if (need_dict) {
7204 PyObject *result = PyDict_New();
7205 PyObject *key, *value;
7206 if (!result)
7207 return NULL;
7208 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007209 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007210 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007211 if (!key || !value)
7212 goto failed1;
7213 if (PyDict_SetItem(result, key, value) == -1)
7214 goto failed1;
7215 Py_DECREF(key);
7216 Py_DECREF(value);
7217 }
7218 return result;
7219 failed1:
7220 Py_XDECREF(key);
7221 Py_XDECREF(value);
7222 Py_DECREF(result);
7223 return NULL;
7224 }
7225
7226 /* Create a three-level trie */
7227 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7228 16*count2 + 128*count3 - 1);
7229 if (!result)
7230 return PyErr_NoMemory();
7231 PyObject_Init(result, &EncodingMapType);
7232 mresult = (struct encoding_map*)result;
7233 mresult->count2 = count2;
7234 mresult->count3 = count3;
7235 mlevel1 = mresult->level1;
7236 mlevel2 = mresult->level23;
7237 mlevel3 = mresult->level23 + 16*count2;
7238 memcpy(mlevel1, level1, 32);
7239 memset(mlevel2, 0xFF, 16*count2);
7240 memset(mlevel3, 0, 128*count3);
7241 count3 = 0;
7242 for (i = 1; i < 256; i++) {
7243 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007244 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007245 /* unmapped character */
7246 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007247 o1 = PyUnicode_READ(kind, data, i)>>11;
7248 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007249 i2 = 16*mlevel1[o1] + o2;
7250 if (mlevel2[i2] == 0xFF)
7251 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007252 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007253 i3 = 128*mlevel2[i2] + o3;
7254 mlevel3[i3] = i;
7255 }
7256 return result;
7257}
7258
7259static int
7260encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7261{
7262 struct encoding_map *map = (struct encoding_map*)mapping;
7263 int l1 = c>>11;
7264 int l2 = (c>>7) & 0xF;
7265 int l3 = c & 0x7F;
7266 int i;
7267
7268#ifdef Py_UNICODE_WIDE
7269 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007270 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007271 }
7272#endif
7273 if (c == 0)
7274 return 0;
7275 /* level 1*/
7276 i = map->level1[l1];
7277 if (i == 0xFF) {
7278 return -1;
7279 }
7280 /* level 2*/
7281 i = map->level23[16*i+l2];
7282 if (i == 0xFF) {
7283 return -1;
7284 }
7285 /* level 3 */
7286 i = map->level23[16*map->count2 + 128*i + l3];
7287 if (i == 0) {
7288 return -1;
7289 }
7290 return i;
7291}
7292
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007293/* Lookup the character ch in the mapping. If the character
7294 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007295 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007296static PyObject *
7297charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298{
Christian Heimes217cfd12007-12-02 14:31:20 +00007299 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007300 PyObject *x;
7301
7302 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007303 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007304 x = PyObject_GetItem(mapping, w);
7305 Py_DECREF(w);
7306 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007307 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7308 /* No mapping found means: mapping is undefined. */
7309 PyErr_Clear();
7310 x = Py_None;
7311 Py_INCREF(x);
7312 return x;
7313 } else
7314 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007315 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007316 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007317 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007318 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 long value = PyLong_AS_LONG(x);
7320 if (value < 0 || value > 255) {
7321 PyErr_SetString(PyExc_TypeError,
7322 "character mapping must be in range(256)");
7323 Py_DECREF(x);
7324 return NULL;
7325 }
7326 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007328 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007329 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007331 /* wrong return value */
7332 PyErr_Format(PyExc_TypeError,
7333 "character mapping must return integer, bytes or None, not %.400s",
7334 x->ob_type->tp_name);
7335 Py_DECREF(x);
7336 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337 }
7338}
7339
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007340static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007341charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007342{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007343 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7344 /* exponentially overallocate to minimize reallocations */
7345 if (requiredsize < 2*outsize)
7346 requiredsize = 2*outsize;
7347 if (_PyBytes_Resize(outobj, requiredsize))
7348 return -1;
7349 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007350}
7351
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or a resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007355/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007356 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007357 space is available. Return a new reference to the object that
7358 was put in the output buffer, or Py_None, if the mapping was undefined
7359 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007360 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007361static charmapencode_result
7362charmapencode_output(Py_UNICODE c, PyObject *mapping,
7363 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007364{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007365 PyObject *rep;
7366 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007367 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007368
Christian Heimes90aa7642007-12-19 02:45:37 +00007369 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007370 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007372 if (res == -1)
7373 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007374 if (outsize<requiredsize)
7375 if (charmapencode_resize(outobj, outpos, requiredsize))
7376 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007377 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007378 outstart[(*outpos)++] = (char)res;
7379 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007380 }
7381
7382 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007383 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007385 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007386 Py_DECREF(rep);
7387 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007388 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007389 if (PyLong_Check(rep)) {
7390 Py_ssize_t requiredsize = *outpos+1;
7391 if (outsize<requiredsize)
7392 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7393 Py_DECREF(rep);
7394 return enc_EXCEPTION;
7395 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007396 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007397 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007398 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007399 else {
7400 const char *repchars = PyBytes_AS_STRING(rep);
7401 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7402 Py_ssize_t requiredsize = *outpos+repsize;
7403 if (outsize<requiredsize)
7404 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7405 Py_DECREF(rep);
7406 return enc_EXCEPTION;
7407 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007408 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007409 memcpy(outstart + *outpos, repchars, repsize);
7410 *outpos += repsize;
7411 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007412 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007413 Py_DECREF(rep);
7414 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007415}
7416
7417/* handle an error in PyUnicode_EncodeCharmap
7418 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007419static int
7420charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007421 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007422 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007423 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007424 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007425{
7426 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007427 Py_ssize_t repsize;
7428 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007429 Py_UNICODE *uni2;
7430 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007431 Py_ssize_t collstartpos = *inpos;
7432 Py_ssize_t collendpos = *inpos+1;
7433 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007434 char *encoding = "charmap";
7435 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007436 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007437
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007438 /* find all unencodable characters */
7439 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007440 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007441 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007442 int res = encoding_map_lookup(p[collendpos], mapping);
7443 if (res != -1)
7444 break;
7445 ++collendpos;
7446 continue;
7447 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007448
Benjamin Peterson29060642009-01-31 22:14:21 +00007449 rep = charmapencode_lookup(p[collendpos], mapping);
7450 if (rep==NULL)
7451 return -1;
7452 else if (rep!=Py_None) {
7453 Py_DECREF(rep);
7454 break;
7455 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007456 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007458 }
7459 /* cache callback name lookup
7460 * (if not done yet, i.e. it's the first error) */
7461 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 if ((errors==NULL) || (!strcmp(errors, "strict")))
7463 *known_errorHandler = 1;
7464 else if (!strcmp(errors, "replace"))
7465 *known_errorHandler = 2;
7466 else if (!strcmp(errors, "ignore"))
7467 *known_errorHandler = 3;
7468 else if (!strcmp(errors, "xmlcharrefreplace"))
7469 *known_errorHandler = 4;
7470 else
7471 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007472 }
7473 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007474 case 1: /* strict */
7475 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7476 return -1;
7477 case 2: /* replace */
7478 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007479 x = charmapencode_output('?', mapping, res, respos);
7480 if (x==enc_EXCEPTION) {
7481 return -1;
7482 }
7483 else if (x==enc_FAILED) {
7484 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7485 return -1;
7486 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007487 }
7488 /* fall through */
7489 case 3: /* ignore */
7490 *inpos = collendpos;
7491 break;
7492 case 4: /* xmlcharrefreplace */
7493 /* generate replacement (temporarily (mis)uses p) */
7494 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007495 char buffer[2+29+1+1];
7496 char *cp;
7497 sprintf(buffer, "&#%d;", (int)p[collpos]);
7498 for (cp = buffer; *cp; ++cp) {
7499 x = charmapencode_output(*cp, mapping, res, respos);
7500 if (x==enc_EXCEPTION)
7501 return -1;
7502 else if (x==enc_FAILED) {
7503 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7504 return -1;
7505 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007506 }
7507 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007508 *inpos = collendpos;
7509 break;
7510 default:
7511 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007512 encoding, reason, p, size, exceptionObject,
7513 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007514 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007515 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007516 if (PyBytes_Check(repunicode)) {
7517 /* Directly copy bytes result to output. */
7518 Py_ssize_t outsize = PyBytes_Size(*res);
7519 Py_ssize_t requiredsize;
7520 repsize = PyBytes_Size(repunicode);
7521 requiredsize = *respos + repsize;
7522 if (requiredsize > outsize)
7523 /* Make room for all additional bytes. */
7524 if (charmapencode_resize(res, respos, requiredsize)) {
7525 Py_DECREF(repunicode);
7526 return -1;
7527 }
7528 memcpy(PyBytes_AsString(*res) + *respos,
7529 PyBytes_AsString(repunicode), repsize);
7530 *respos += repsize;
7531 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007532 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007533 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007534 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007535 /* generate replacement */
7536 repsize = PyUnicode_GET_SIZE(repunicode);
7537 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007538 x = charmapencode_output(*uni2, mapping, res, respos);
7539 if (x==enc_EXCEPTION) {
7540 return -1;
7541 }
7542 else if (x==enc_FAILED) {
7543 Py_DECREF(repunicode);
7544 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7545 return -1;
7546 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007547 }
7548 *inpos = newpos;
7549 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007550 }
7551 return 0;
7552}
7553
Alexander Belopolsky40018472011-02-26 01:02:56 +00007554PyObject *
7555PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7556 Py_ssize_t size,
7557 PyObject *mapping,
7558 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007560 /* output object */
7561 PyObject *res = NULL;
7562 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007563 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007564 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007565 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007566 PyObject *errorHandler = NULL;
7567 PyObject *exc = NULL;
7568 /* the following variable is used for caching string comparisons
7569 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7570 * 3=ignore, 4=xmlcharrefreplace */
7571 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572
7573 /* Default to Latin-1 */
7574 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007575 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007577 /* allocate enough for a simple encoding without
7578 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007579 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007580 if (res == NULL)
7581 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007582 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007583 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007585 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007586 /* try to encode it */
7587 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7588 if (x==enc_EXCEPTION) /* error */
7589 goto onError;
7590 if (x==enc_FAILED) { /* unencodable character */
7591 if (charmap_encoding_error(p, size, &inpos, mapping,
7592 &exc,
7593 &known_errorHandler, &errorHandler, errors,
7594 &res, &respos)) {
7595 goto onError;
7596 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007597 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 else
7599 /* done with this character => adjust input position */
7600 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007603 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007604 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007605 if (_PyBytes_Resize(&res, respos) < 0)
7606 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007607
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007608 Py_XDECREF(exc);
7609 Py_XDECREF(errorHandler);
7610 return res;
7611
Benjamin Peterson29060642009-01-31 22:14:21 +00007612 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007613 Py_XDECREF(res);
7614 Py_XDECREF(exc);
7615 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616 return NULL;
7617}
7618
Alexander Belopolsky40018472011-02-26 01:02:56 +00007619PyObject *
7620PyUnicode_AsCharmapString(PyObject *unicode,
7621 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622{
7623 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007624 PyErr_BadArgument();
7625 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626 }
7627 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007628 PyUnicode_GET_SIZE(unicode),
7629 mapping,
7630 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631}
7632
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007633/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007634static void
7635make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007636 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007637 Py_ssize_t startpos, Py_ssize_t endpos,
7638 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007640 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007641 *exceptionObject = _PyUnicodeTranslateError_Create(
7642 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643 }
7644 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7646 goto onError;
7647 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7648 goto onError;
7649 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7650 goto onError;
7651 return;
7652 onError:
7653 Py_DECREF(*exceptionObject);
7654 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655 }
7656}
7657
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007658/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007659static void
7660raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007661 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007662 Py_ssize_t startpos, Py_ssize_t endpos,
7663 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007664{
7665 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007666 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007667 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007669}
7670
7671/* error handling callback helper:
7672 build arguments, call the callback and check the arguments,
7673 put the result into newpos and return the replacement string, which
7674 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007675static PyObject *
7676unicode_translate_call_errorhandler(const char *errors,
7677 PyObject **errorHandler,
7678 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007679 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007680 Py_ssize_t startpos, Py_ssize_t endpos,
7681 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007682{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007683 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007684
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007685 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007686 PyObject *restuple;
7687 PyObject *resunicode;
7688
7689 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007690 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007691 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007692 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007693 }
7694
7695 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007696 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007697 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007698 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007699
7700 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007701 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007702 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007703 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007704 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007705 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007706 Py_DECREF(restuple);
7707 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007708 }
7709 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007710 &resunicode, &i_newpos)) {
7711 Py_DECREF(restuple);
7712 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007713 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007714 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007715 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007716 else
7717 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007718 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007719 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7720 Py_DECREF(restuple);
7721 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007722 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007723 Py_INCREF(resunicode);
7724 Py_DECREF(restuple);
7725 return resunicode;
7726}
7727
7728/* Lookup the character ch in the mapping and put the result in result,
7729 which must be decrefed by the caller.
7730 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007731static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007732charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007733{
Christian Heimes217cfd12007-12-02 14:31:20 +00007734 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007735 PyObject *x;
7736
7737 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007738 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007739 x = PyObject_GetItem(mapping, w);
7740 Py_DECREF(w);
7741 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007742 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7743 /* No mapping found means: use 1:1 mapping. */
7744 PyErr_Clear();
7745 *result = NULL;
7746 return 0;
7747 } else
7748 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007749 }
7750 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007751 *result = x;
7752 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007753 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007754 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007755 long value = PyLong_AS_LONG(x);
7756 long max = PyUnicode_GetMax();
7757 if (value < 0 || value > max) {
7758 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007759 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 Py_DECREF(x);
7761 return -1;
7762 }
7763 *result = x;
7764 return 0;
7765 }
7766 else if (PyUnicode_Check(x)) {
7767 *result = x;
7768 return 0;
7769 }
7770 else {
7771 /* wrong return value */
7772 PyErr_SetString(PyExc_TypeError,
7773 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007774 Py_DECREF(x);
7775 return -1;
7776 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007777}
7778/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 if not reallocate and adjust various state variables.
7780 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007781static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007782charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007784{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007785 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007786 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 /* exponentially overallocate to minimize reallocations */
7788 if (requiredsize < 2 * oldsize)
7789 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007790 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7791 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007793 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007794 }
7795 return 0;
7796}
7797/* lookup the character, put the result in the output string and adjust
7798 various state variables. Return a new reference to the object that
7799 was put in the output buffer in *result, or Py_None, if the mapping was
7800 undefined (in which case no character was written).
7801 The called must decref result.
7802 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007803static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007804charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7805 PyObject *mapping, Py_UCS4 **output,
7806 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007807 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007808{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007809 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7810 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007811 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007812 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007813 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007814 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007815 }
7816 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007817 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007818 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007820 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007821 }
7822 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007823 Py_ssize_t repsize;
7824 if (PyUnicode_READY(*res) == -1)
7825 return -1;
7826 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007827 if (repsize==1) {
7828 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007829 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007830 }
7831 else if (repsize!=0) {
7832 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007833 Py_ssize_t requiredsize = *opos +
7834 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007835 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007836 Py_ssize_t i;
7837 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007839 for(i = 0; i < repsize; i++)
7840 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007842 }
7843 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007845 return 0;
7846}
7847
Alexander Belopolsky40018472011-02-26 01:02:56 +00007848PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007849_PyUnicode_TranslateCharmap(PyObject *input,
7850 PyObject *mapping,
7851 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007853 /* input object */
7854 char *idata;
7855 Py_ssize_t size, i;
7856 int kind;
7857 /* output buffer */
7858 Py_UCS4 *output = NULL;
7859 Py_ssize_t osize;
7860 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007861 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007862 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007863 char *reason = "character maps to <undefined>";
7864 PyObject *errorHandler = NULL;
7865 PyObject *exc = NULL;
7866 /* the following variable is used for caching string comparisons
7867 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7868 * 3=ignore, 4=xmlcharrefreplace */
7869 int known_errorHandler = -1;
7870
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007872 PyErr_BadArgument();
7873 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007876 if (PyUnicode_READY(input) == -1)
7877 return NULL;
7878 idata = (char*)PyUnicode_DATA(input);
7879 kind = PyUnicode_KIND(input);
7880 size = PyUnicode_GET_LENGTH(input);
7881 i = 0;
7882
7883 if (size == 0) {
7884 Py_INCREF(input);
7885 return input;
7886 }
7887
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007888 /* allocate enough for a simple 1:1 translation without
7889 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007890 osize = size;
7891 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7892 opos = 0;
7893 if (output == NULL) {
7894 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007896 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007898 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 /* try to encode it */
7900 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007901 if (charmaptranslate_output(input, i, mapping,
7902 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007903 Py_XDECREF(x);
7904 goto onError;
7905 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007906 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007907 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007908 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007909 else { /* untranslatable character */
7910 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7911 Py_ssize_t repsize;
7912 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007913 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007914 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007915 Py_ssize_t collstart = i;
7916 Py_ssize_t collend = i+1;
7917 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007918
Benjamin Peterson29060642009-01-31 22:14:21 +00007919 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007920 while (collend < size) {
7921 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007922 goto onError;
7923 Py_XDECREF(x);
7924 if (x!=Py_None)
7925 break;
7926 ++collend;
7927 }
7928 /* cache callback name lookup
7929 * (if not done yet, i.e. it's the first error) */
7930 if (known_errorHandler==-1) {
7931 if ((errors==NULL) || (!strcmp(errors, "strict")))
7932 known_errorHandler = 1;
7933 else if (!strcmp(errors, "replace"))
7934 known_errorHandler = 2;
7935 else if (!strcmp(errors, "ignore"))
7936 known_errorHandler = 3;
7937 else if (!strcmp(errors, "xmlcharrefreplace"))
7938 known_errorHandler = 4;
7939 else
7940 known_errorHandler = 0;
7941 }
7942 switch (known_errorHandler) {
7943 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007944 raise_translate_exception(&exc, input, collstart,
7945 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007946 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007947 case 2: /* replace */
7948 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007949 for (coll = collstart; coll<collend; coll++)
7950 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007951 /* fall through */
7952 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007953 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007954 break;
7955 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007956 /* generate replacement (temporarily (mis)uses i) */
7957 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007958 char buffer[2+29+1+1];
7959 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007960 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7961 if (charmaptranslate_makespace(&output, &osize,
7962 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007963 goto onError;
7964 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007965 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007966 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007967 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 break;
7969 default:
7970 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007971 reason, input, &exc,
7972 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007973 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 goto onError;
7975 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007976 repsize = PyUnicode_GET_LENGTH(repunicode);
7977 if (charmaptranslate_makespace(&output, &osize,
7978 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 Py_DECREF(repunicode);
7980 goto onError;
7981 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007982 for (uni2 = 0; repsize-->0; ++uni2)
7983 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7984 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007985 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007986 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007987 }
7988 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007989 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7990 if (!res)
7991 goto onError;
7992 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007993 Py_XDECREF(exc);
7994 Py_XDECREF(errorHandler);
7995 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996
Benjamin Peterson29060642009-01-31 22:14:21 +00007997 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007998 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007999 Py_XDECREF(exc);
8000 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001 return NULL;
8002}
8003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008004/* Deprecated. Use PyUnicode_Translate instead. */
8005PyObject *
8006PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8007 Py_ssize_t size,
8008 PyObject *mapping,
8009 const char *errors)
8010{
8011 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8012 if (!unicode)
8013 return NULL;
8014 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8015}
8016
Alexander Belopolsky40018472011-02-26 01:02:56 +00008017PyObject *
8018PyUnicode_Translate(PyObject *str,
8019 PyObject *mapping,
8020 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021{
8022 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008023
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024 str = PyUnicode_FromObject(str);
8025 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008027 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028 Py_DECREF(str);
8029 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008030
Benjamin Peterson29060642009-01-31 22:14:21 +00008031 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 Py_XDECREF(str);
8033 return NULL;
8034}
Tim Petersced69f82003-09-16 20:30:58 +00008035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008036static Py_UCS4
8037fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
8038{
8039 /* No need to call PyUnicode_READY(self) because this function is only
8040 called as a callback from fixup() which does it already. */
8041 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8042 const int kind = PyUnicode_KIND(self);
8043 void *data = PyUnicode_DATA(self);
8044 Py_UCS4 maxchar = 0, ch, fixed;
8045 Py_ssize_t i;
8046
8047 for (i = 0; i < len; ++i) {
8048 ch = PyUnicode_READ(kind, data, i);
8049 fixed = 0;
8050 if (ch > 127) {
8051 if (Py_UNICODE_ISSPACE(ch))
8052 fixed = ' ';
8053 else {
8054 const int decimal = Py_UNICODE_TODECIMAL(ch);
8055 if (decimal >= 0)
8056 fixed = '0' + decimal;
8057 }
8058 if (fixed != 0) {
8059 if (fixed > maxchar)
8060 maxchar = fixed;
8061 PyUnicode_WRITE(kind, data, i, fixed);
8062 }
8063 else if (ch > maxchar)
8064 maxchar = ch;
8065 }
8066 else if (ch > maxchar)
8067 maxchar = ch;
8068 }
8069
8070 return maxchar;
8071}
8072
8073PyObject *
8074_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8075{
8076 if (!PyUnicode_Check(unicode)) {
8077 PyErr_BadInternalCall();
8078 return NULL;
8079 }
8080 if (PyUnicode_READY(unicode) == -1)
8081 return NULL;
8082 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8083 /* If the string is already ASCII, just return the same string */
8084 Py_INCREF(unicode);
8085 return unicode;
8086 }
8087 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
8088}
8089
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008090PyObject *
8091PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8092 Py_ssize_t length)
8093{
8094 PyObject *result;
8095 Py_UNICODE *p; /* write pointer into result */
8096 Py_ssize_t i;
8097 /* Copy to a new string */
8098 result = (PyObject *)_PyUnicode_New(length);
8099 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8100 if (result == NULL)
8101 return result;
8102 p = PyUnicode_AS_UNICODE(result);
8103 /* Iterate over code points */
8104 for (i = 0; i < length; i++) {
8105 Py_UNICODE ch =s[i];
8106 if (ch > 127) {
8107 int decimal = Py_UNICODE_TODECIMAL(ch);
8108 if (decimal >= 0)
8109 p[i] = '0' + decimal;
8110 }
8111 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008112#ifndef DONT_MAKE_RESULT_READY
8113 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008114 Py_DECREF(result);
8115 return NULL;
8116 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008117#endif
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008118 return result;
8119}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008120/* --- Decimal Encoder ---------------------------------------------------- */
8121
Alexander Belopolsky40018472011-02-26 01:02:56 +00008122int
8123PyUnicode_EncodeDecimal(Py_UNICODE *s,
8124 Py_ssize_t length,
8125 char *output,
8126 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008127{
8128 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008129 PyObject *errorHandler = NULL;
8130 PyObject *exc = NULL;
8131 const char *encoding = "decimal";
8132 const char *reason = "invalid decimal Unicode string";
8133 /* the following variable is used for caching string comparisons
8134 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8135 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008136
8137 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008138 PyErr_BadArgument();
8139 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008140 }
8141
8142 p = s;
8143 end = s + length;
8144 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008145 register Py_UNICODE ch = *p;
8146 int decimal;
8147 PyObject *repunicode;
8148 Py_ssize_t repsize;
8149 Py_ssize_t newpos;
8150 Py_UNICODE *uni2;
8151 Py_UNICODE *collstart;
8152 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008153
Benjamin Peterson29060642009-01-31 22:14:21 +00008154 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008155 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008156 ++p;
8157 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008158 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008159 decimal = Py_UNICODE_TODECIMAL(ch);
8160 if (decimal >= 0) {
8161 *output++ = '0' + decimal;
8162 ++p;
8163 continue;
8164 }
8165 if (0 < ch && ch < 256) {
8166 *output++ = (char)ch;
8167 ++p;
8168 continue;
8169 }
8170 /* All other characters are considered unencodable */
8171 collstart = p;
8172 collend = p+1;
8173 while (collend < end) {
8174 if ((0 < *collend && *collend < 256) ||
8175 !Py_UNICODE_ISSPACE(*collend) ||
8176 Py_UNICODE_TODECIMAL(*collend))
8177 break;
8178 }
8179 /* cache callback name lookup
8180 * (if not done yet, i.e. it's the first error) */
8181 if (known_errorHandler==-1) {
8182 if ((errors==NULL) || (!strcmp(errors, "strict")))
8183 known_errorHandler = 1;
8184 else if (!strcmp(errors, "replace"))
8185 known_errorHandler = 2;
8186 else if (!strcmp(errors, "ignore"))
8187 known_errorHandler = 3;
8188 else if (!strcmp(errors, "xmlcharrefreplace"))
8189 known_errorHandler = 4;
8190 else
8191 known_errorHandler = 0;
8192 }
8193 switch (known_errorHandler) {
8194 case 1: /* strict */
8195 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8196 goto onError;
8197 case 2: /* replace */
8198 for (p = collstart; p < collend; ++p)
8199 *output++ = '?';
8200 /* fall through */
8201 case 3: /* ignore */
8202 p = collend;
8203 break;
8204 case 4: /* xmlcharrefreplace */
8205 /* generate replacement (temporarily (mis)uses p) */
8206 for (p = collstart; p < collend; ++p)
8207 output += sprintf(output, "&#%d;", (int)*p);
8208 p = collend;
8209 break;
8210 default:
8211 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8212 encoding, reason, s, length, &exc,
8213 collstart-s, collend-s, &newpos);
8214 if (repunicode == NULL)
8215 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008216 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008217 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008218 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8219 Py_DECREF(repunicode);
8220 goto onError;
8221 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008222 /* generate replacement */
8223 repsize = PyUnicode_GET_SIZE(repunicode);
8224 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8225 Py_UNICODE ch = *uni2;
8226 if (Py_UNICODE_ISSPACE(ch))
8227 *output++ = ' ';
8228 else {
8229 decimal = Py_UNICODE_TODECIMAL(ch);
8230 if (decimal >= 0)
8231 *output++ = '0' + decimal;
8232 else if (0 < ch && ch < 256)
8233 *output++ = (char)ch;
8234 else {
8235 Py_DECREF(repunicode);
8236 raise_encode_exception(&exc, encoding,
8237 s, length, collstart-s, collend-s, reason);
8238 goto onError;
8239 }
8240 }
8241 }
8242 p = s + newpos;
8243 Py_DECREF(repunicode);
8244 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008245 }
8246 /* 0-terminate the output string */
8247 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008248 Py_XDECREF(exc);
8249 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008250 return 0;
8251
Benjamin Peterson29060642009-01-31 22:14:21 +00008252 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008253 Py_XDECREF(exc);
8254 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008255 return -1;
8256}
8257
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258/* --- Helpers ------------------------------------------------------------ */
8259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008260#include "stringlib/ucs1lib.h"
8261#include "stringlib/fastsearch.h"
8262#include "stringlib/partition.h"
8263#include "stringlib/split.h"
8264#include "stringlib/count.h"
8265#include "stringlib/find.h"
8266#include "stringlib/localeutil.h"
8267#include "stringlib/undef.h"
8268
8269#include "stringlib/ucs2lib.h"
8270#include "stringlib/fastsearch.h"
8271#include "stringlib/partition.h"
8272#include "stringlib/split.h"
8273#include "stringlib/count.h"
8274#include "stringlib/find.h"
8275#include "stringlib/localeutil.h"
8276#include "stringlib/undef.h"
8277
8278#include "stringlib/ucs4lib.h"
8279#include "stringlib/fastsearch.h"
8280#include "stringlib/partition.h"
8281#include "stringlib/split.h"
8282#include "stringlib/count.h"
8283#include "stringlib/find.h"
8284#include "stringlib/localeutil.h"
8285#include "stringlib/undef.h"
8286
8287static Py_ssize_t
8288any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8289 const Py_UCS1*, Py_ssize_t,
8290 Py_ssize_t, Py_ssize_t),
8291 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8292 const Py_UCS2*, Py_ssize_t,
8293 Py_ssize_t, Py_ssize_t),
8294 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8295 const Py_UCS4*, Py_ssize_t,
8296 Py_ssize_t, Py_ssize_t),
8297 PyObject* s1, PyObject* s2,
8298 Py_ssize_t start,
8299 Py_ssize_t end)
8300{
8301 int kind1, kind2, kind;
8302 void *buf1, *buf2;
8303 Py_ssize_t len1, len2, result;
8304
8305 kind1 = PyUnicode_KIND(s1);
8306 kind2 = PyUnicode_KIND(s2);
8307 kind = kind1 > kind2 ? kind1 : kind2;
8308 buf1 = PyUnicode_DATA(s1);
8309 buf2 = PyUnicode_DATA(s2);
8310 if (kind1 != kind)
8311 buf1 = _PyUnicode_AsKind(s1, kind);
8312 if (!buf1)
8313 return -2;
8314 if (kind2 != kind)
8315 buf2 = _PyUnicode_AsKind(s2, kind);
8316 if (!buf2) {
8317 if (kind1 != kind) PyMem_Free(buf1);
8318 return -2;
8319 }
8320 len1 = PyUnicode_GET_LENGTH(s1);
8321 len2 = PyUnicode_GET_LENGTH(s2);
8322
8323 switch(kind) {
8324 case PyUnicode_1BYTE_KIND:
8325 result = ucs1(buf1, len1, buf2, len2, start, end);
8326 break;
8327 case PyUnicode_2BYTE_KIND:
8328 result = ucs2(buf1, len1, buf2, len2, start, end);
8329 break;
8330 case PyUnicode_4BYTE_KIND:
8331 result = ucs4(buf1, len1, buf2, len2, start, end);
8332 break;
8333 default:
8334 assert(0); result = -2;
8335 }
8336
8337 if (kind1 != kind)
8338 PyMem_Free(buf1);
8339 if (kind2 != kind)
8340 PyMem_Free(buf2);
8341
8342 return result;
8343}
8344
8345Py_ssize_t
8346_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8347 Py_ssize_t n_buffer,
8348 void *digits, Py_ssize_t n_digits,
8349 Py_ssize_t min_width,
8350 const char *grouping,
8351 const char *thousands_sep)
8352{
8353 switch(kind) {
8354 case PyUnicode_1BYTE_KIND:
8355 return _PyUnicode_ucs1_InsertThousandsGrouping(
8356 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8357 min_width, grouping, thousands_sep);
8358 case PyUnicode_2BYTE_KIND:
8359 return _PyUnicode_ucs2_InsertThousandsGrouping(
8360 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8361 min_width, grouping, thousands_sep);
8362 case PyUnicode_4BYTE_KIND:
8363 return _PyUnicode_ucs4_InsertThousandsGrouping(
8364 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8365 min_width, grouping, thousands_sep);
8366 }
8367 assert(0);
8368 return -1;
8369}
8370
8371
Eric Smith8c663262007-08-25 02:26:07 +00008372#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008373#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008374
Thomas Wouters477c8d52006-05-27 19:21:47 +00008375#include "stringlib/count.h"
8376#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008377
/* Helper macro that normalises a start/end slice pair against len.

   end is capped at len; a negative end or start has len added to it and
   is then floored at 0.  start is not capped at len.  start and end are
   assigned to, and each argument may be evaluated more than once.  The
   expansion is a bare sequence of if statements, so use it only as a
   complete statement. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len) {                            \
        end = len;                              \
    }                                           \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0) {                          \
            end = 0;                            \
        }                                       \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0) {                        \
            start = 0;                          \
        }                                       \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008392
Alexander Belopolsky40018472011-02-26 01:02:56 +00008393Py_ssize_t
8394PyUnicode_Count(PyObject *str,
8395 PyObject *substr,
8396 Py_ssize_t start,
8397 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008398{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008399 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008400 PyUnicodeObject* str_obj;
8401 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008402 int kind1, kind2, kind;
8403 void *buf1 = NULL, *buf2 = NULL;
8404 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008405
Thomas Wouters477c8d52006-05-27 19:21:47 +00008406 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008407 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008408 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008409 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008410 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 Py_DECREF(str_obj);
8412 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413 }
Tim Petersced69f82003-09-16 20:30:58 +00008414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008415 kind1 = PyUnicode_KIND(str_obj);
8416 kind2 = PyUnicode_KIND(sub_obj);
8417 kind = kind1 > kind2 ? kind1 : kind2;
8418 buf1 = PyUnicode_DATA(str_obj);
8419 if (kind1 != kind)
8420 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8421 if (!buf1)
8422 goto onError;
8423 buf2 = PyUnicode_DATA(sub_obj);
8424 if (kind2 != kind)
8425 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8426 if (!buf2)
8427 goto onError;
8428 len1 = PyUnicode_GET_LENGTH(str_obj);
8429 len2 = PyUnicode_GET_LENGTH(sub_obj);
8430
8431 ADJUST_INDICES(start, end, len1);
8432 switch(kind) {
8433 case PyUnicode_1BYTE_KIND:
8434 result = ucs1lib_count(
8435 ((Py_UCS1*)buf1) + start, end - start,
8436 buf2, len2, PY_SSIZE_T_MAX
8437 );
8438 break;
8439 case PyUnicode_2BYTE_KIND:
8440 result = ucs2lib_count(
8441 ((Py_UCS2*)buf1) + start, end - start,
8442 buf2, len2, PY_SSIZE_T_MAX
8443 );
8444 break;
8445 case PyUnicode_4BYTE_KIND:
8446 result = ucs4lib_count(
8447 ((Py_UCS4*)buf1) + start, end - start,
8448 buf2, len2, PY_SSIZE_T_MAX
8449 );
8450 break;
8451 default:
8452 assert(0); result = 0;
8453 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008454
8455 Py_DECREF(sub_obj);
8456 Py_DECREF(str_obj);
8457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008458 if (kind1 != kind)
8459 PyMem_Free(buf1);
8460 if (kind2 != kind)
8461 PyMem_Free(buf2);
8462
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008464 onError:
8465 Py_DECREF(sub_obj);
8466 Py_DECREF(str_obj);
8467 if (kind1 != kind && buf1)
8468 PyMem_Free(buf1);
8469 if (kind2 != kind && buf2)
8470 PyMem_Free(buf2);
8471 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008472}
8473
Alexander Belopolsky40018472011-02-26 01:02:56 +00008474Py_ssize_t
8475PyUnicode_Find(PyObject *str,
8476 PyObject *sub,
8477 Py_ssize_t start,
8478 Py_ssize_t end,
8479 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008480{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008481 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008482
Guido van Rossumd57fd912000-03-10 22:53:23 +00008483 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008484 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008486 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008487 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 Py_DECREF(str);
8489 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008490 }
Tim Petersced69f82003-09-16 20:30:58 +00008491
Thomas Wouters477c8d52006-05-27 19:21:47 +00008492 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008493 result = any_find_slice(
8494 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8495 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008496 );
8497 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008498 result = any_find_slice(
8499 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8500 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008501 );
8502
Guido van Rossumd57fd912000-03-10 22:53:23 +00008503 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008504 Py_DECREF(sub);
8505
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506 return result;
8507}
8508
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509Py_ssize_t
8510PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8511 Py_ssize_t start, Py_ssize_t end,
8512 int direction)
8513{
8514 char *result;
8515 int kind;
8516 if (PyUnicode_READY(str) == -1)
8517 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008518 if (start < 0 || end < 0) {
8519 PyErr_SetString(PyExc_IndexError, "string index out of range");
8520 return -2;
8521 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008522 if (end > PyUnicode_GET_LENGTH(str))
8523 end = PyUnicode_GET_LENGTH(str);
8524 kind = PyUnicode_KIND(str);
8525 result = findchar(PyUnicode_1BYTE_DATA(str)
8526 + PyUnicode_KIND_SIZE(kind, start),
8527 kind,
8528 end-start, ch, direction);
8529 if (!result)
8530 return -1;
8531 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8532}
8533
Alexander Belopolsky40018472011-02-26 01:02:56 +00008534static int
8535tailmatch(PyUnicodeObject *self,
8536 PyUnicodeObject *substring,
8537 Py_ssize_t start,
8538 Py_ssize_t end,
8539 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541 int kind_self;
8542 int kind_sub;
8543 void *data_self;
8544 void *data_sub;
8545 Py_ssize_t offset;
8546 Py_ssize_t i;
8547 Py_ssize_t end_sub;
8548
8549 if (PyUnicode_READY(self) == -1 ||
8550 PyUnicode_READY(substring) == -1)
8551 return 0;
8552
8553 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554 return 1;
8555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008556 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8557 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008558 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008561 kind_self = PyUnicode_KIND(self);
8562 data_self = PyUnicode_DATA(self);
8563 kind_sub = PyUnicode_KIND(substring);
8564 data_sub = PyUnicode_DATA(substring);
8565 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8566
8567 if (direction > 0)
8568 offset = end;
8569 else
8570 offset = start;
8571
8572 if (PyUnicode_READ(kind_self, data_self, offset) ==
8573 PyUnicode_READ(kind_sub, data_sub, 0) &&
8574 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8575 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8576 /* If both are of the same kind, memcmp is sufficient */
8577 if (kind_self == kind_sub) {
8578 return ! memcmp((char *)data_self +
8579 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8580 data_sub,
8581 PyUnicode_GET_LENGTH(substring) *
8582 PyUnicode_CHARACTER_SIZE(substring));
8583 }
8584 /* otherwise we have to compare each character by first accesing it */
8585 else {
8586 /* We do not need to compare 0 and len(substring)-1 because
8587 the if statement above ensured already that they are equal
8588 when we end up here. */
8589 // TODO: honor direction and do a forward or backwards search
8590 for (i = 1; i < end_sub; ++i) {
8591 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8592 PyUnicode_READ(kind_sub, data_sub, i))
8593 return 0;
8594 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008596 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597 }
8598
8599 return 0;
8600}
8601
Alexander Belopolsky40018472011-02-26 01:02:56 +00008602Py_ssize_t
8603PyUnicode_Tailmatch(PyObject *str,
8604 PyObject *substr,
8605 Py_ssize_t start,
8606 Py_ssize_t end,
8607 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008609 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008610
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611 str = PyUnicode_FromObject(str);
8612 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614 substr = PyUnicode_FromObject(substr);
8615 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 Py_DECREF(str);
8617 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618 }
Tim Petersced69f82003-09-16 20:30:58 +00008619
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 (PyUnicodeObject *)substr,
8622 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623 Py_DECREF(str);
8624 Py_DECREF(substr);
8625 return result;
8626}
8627
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628/* Apply fixfct filter to the Unicode object self and return a
8629 reference to the modified object */
8630
Alexander Belopolsky40018472011-02-26 01:02:56 +00008631static PyObject *
8632fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635 PyObject *u;
8636 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008638 if (PyUnicode_READY(self) == -1)
8639 return NULL;
8640 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8641 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8642 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008644 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008645
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008646 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8647 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008648
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649 /* fix functions return the new maximum character in a string,
8650 if the kind of the resulting unicode object does not change,
8651 everything is fine. Otherwise we need to change the string kind
8652 and re-run the fix function. */
8653 maxchar_new = fixfct((PyUnicodeObject*)u);
8654 if (maxchar_new == 0)
8655 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8656 else if (maxchar_new <= 127)
8657 maxchar_new = 127;
8658 else if (maxchar_new <= 255)
8659 maxchar_new = 255;
8660 else if (maxchar_new <= 65535)
8661 maxchar_new = 65535;
8662 else
8663 maxchar_new = 1114111; /* 0x10ffff */
8664
8665 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008666 /* fixfct should return TRUE if it modified the buffer. If
8667 FALSE, return a reference to the original buffer instead
8668 (to save space, not time) */
8669 Py_INCREF(self);
8670 Py_DECREF(u);
8671 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008673 else if (maxchar_new == maxchar_old) {
8674 return u;
8675 }
8676 else {
8677 /* In case the maximum character changed, we need to
8678 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008679 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008680 if (v == NULL) {
8681 Py_DECREF(u);
8682 return NULL;
8683 }
8684 if (maxchar_new > maxchar_old) {
8685 /* If the maxchar increased so that the kind changed, not all
8686 characters are representable anymore and we need to fix the
8687 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008688 if (PyUnicode_CopyCharacters(v, 0,
8689 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008690 PyUnicode_GET_LENGTH(self)) < 0)
8691 {
8692 Py_DECREF(u);
8693 return NULL;
8694 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008695 maxchar_old = fixfct((PyUnicodeObject*)v);
8696 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8697 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008698 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008699 if (PyUnicode_CopyCharacters(v, 0,
8700 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008701 PyUnicode_GET_LENGTH(self)) < 0)
8702 {
8703 Py_DECREF(u);
8704 return NULL;
8705 }
8706 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707
8708 Py_DECREF(u);
8709 return v;
8710 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711}
8712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008713static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008714fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008715{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008716 /* No need to call PyUnicode_READY(self) because this function is only
8717 called as a callback from fixup() which does it already. */
8718 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8719 const int kind = PyUnicode_KIND(self);
8720 void *data = PyUnicode_DATA(self);
8721 int touched = 0;
8722 Py_UCS4 maxchar = 0;
8723 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008725 for (i = 0; i < len; ++i) {
8726 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8727 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8728 if (up != ch) {
8729 if (up > maxchar)
8730 maxchar = up;
8731 PyUnicode_WRITE(kind, data, i, up);
8732 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734 else if (ch > maxchar)
8735 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008736 }
8737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008738 if (touched)
8739 return maxchar;
8740 else
8741 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008742}
8743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008744static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008745fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008746{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008747 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8748 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8749 const int kind = PyUnicode_KIND(self);
8750 void *data = PyUnicode_DATA(self);
8751 int touched = 0;
8752 Py_UCS4 maxchar = 0;
8753 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755 for(i = 0; i < len; ++i) {
8756 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8757 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8758 if (lo != ch) {
8759 if (lo > maxchar)
8760 maxchar = lo;
8761 PyUnicode_WRITE(kind, data, i, lo);
8762 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008763 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008764 else if (ch > maxchar)
8765 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008766 }
8767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008768 if (touched)
8769 return maxchar;
8770 else
8771 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008772}
8773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008774static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008775fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008777 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8778 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8779 const int kind = PyUnicode_KIND(self);
8780 void *data = PyUnicode_DATA(self);
8781 int touched = 0;
8782 Py_UCS4 maxchar = 0;
8783 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008785 for(i = 0; i < len; ++i) {
8786 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8787 Py_UCS4 nu = 0;
8788
8789 if (Py_UNICODE_ISUPPER(ch))
8790 nu = Py_UNICODE_TOLOWER(ch);
8791 else if (Py_UNICODE_ISLOWER(ch))
8792 nu = Py_UNICODE_TOUPPER(ch);
8793
8794 if (nu != 0) {
8795 if (nu > maxchar)
8796 maxchar = nu;
8797 PyUnicode_WRITE(kind, data, i, nu);
8798 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008800 else if (ch > maxchar)
8801 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008802 }
8803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804 if (touched)
8805 return maxchar;
8806 else
8807 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808}
8809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008810static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008811fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008812{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8814 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8815 const int kind = PyUnicode_KIND(self);
8816 void *data = PyUnicode_DATA(self);
8817 int touched = 0;
8818 Py_UCS4 maxchar = 0;
8819 Py_ssize_t i = 0;
8820 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008821
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008822 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824
8825 ch = PyUnicode_READ(kind, data, i);
8826 if (!Py_UNICODE_ISUPPER(ch)) {
8827 maxchar = Py_UNICODE_TOUPPER(ch);
8828 PyUnicode_WRITE(kind, data, i, maxchar);
8829 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008830 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831 ++i;
8832 for(; i < len; ++i) {
8833 ch = PyUnicode_READ(kind, data, i);
8834 if (!Py_UNICODE_ISLOWER(ch)) {
8835 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8836 if (lo > maxchar)
8837 maxchar = lo;
8838 PyUnicode_WRITE(kind, data, i, lo);
8839 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008841 else if (ch > maxchar)
8842 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008844
8845 if (touched)
8846 return maxchar;
8847 else
8848 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849}
8850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008852fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008854 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8855 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8856 const int kind = PyUnicode_KIND(self);
8857 void *data = PyUnicode_DATA(self);
8858 Py_UCS4 maxchar = 0;
8859 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860 int previous_is_cased;
8861
8862 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008863 if (len == 1) {
8864 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8865 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8866 if (ti != ch) {
8867 PyUnicode_WRITE(kind, data, i, ti);
8868 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008869 }
8870 else
8871 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008874 for(; i < len; ++i) {
8875 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8876 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008877
Benjamin Peterson29060642009-01-31 22:14:21 +00008878 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008879 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008880 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008881 nu = Py_UNICODE_TOTITLE(ch);
8882
8883 if (nu > maxchar)
8884 maxchar = nu;
8885 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008886
Benjamin Peterson29060642009-01-31 22:14:21 +00008887 if (Py_UNICODE_ISLOWER(ch) ||
8888 Py_UNICODE_ISUPPER(ch) ||
8889 Py_UNICODE_ISTITLE(ch))
8890 previous_is_cased = 1;
8891 else
8892 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008894 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895}
8896
Tim Peters8ce9f162004-08-27 01:49:32 +00008897PyObject *
8898PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008899{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008900 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008901 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008902 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008903 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008904 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8905 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008906 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008907 Py_ssize_t sz, i, res_offset;
8908 Py_UCS4 maxchar = 0;
8909 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008910
Tim Peters05eba1f2004-08-27 21:32:02 +00008911 fseq = PySequence_Fast(seq, "");
8912 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008913 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008914 }
8915
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008916 /* NOTE: the following code can't call back into Python code,
8917 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008918 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008919
Tim Peters05eba1f2004-08-27 21:32:02 +00008920 seqlen = PySequence_Fast_GET_SIZE(fseq);
8921 /* If empty sequence, return u"". */
8922 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008923 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008924 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008925 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008926 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008927 /* If singleton sequence with an exact Unicode, return that. */
8928 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008929 item = items[0];
8930 if (PyUnicode_CheckExact(item)) {
8931 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008932 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008933 goto Done;
8934 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008935 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008936 else {
8937 /* Set up sep and seplen */
8938 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008939 /* fall back to a blank space separator */
8940 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008941 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008943 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008944 else {
8945 if (!PyUnicode_Check(separator)) {
8946 PyErr_Format(PyExc_TypeError,
8947 "separator: expected str instance,"
8948 " %.80s found",
8949 Py_TYPE(separator)->tp_name);
8950 goto onError;
8951 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008952 if (PyUnicode_READY(separator))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008953 goto onError;
8954 sep = separator;
8955 seplen = PyUnicode_GET_LENGTH(separator);
8956 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8957 /* inc refcount to keep this code path symetric with the
8958 above case of a blank separator */
8959 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008960 }
8961 }
8962
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008963 /* There are at least two things to join, or else we have a subclass
8964 * of str in the sequence.
8965 * Do a pre-pass to figure out the total amount of space we'll
8966 * need (sz), and see whether all argument are strings.
8967 */
8968 sz = 0;
8969 for (i = 0; i < seqlen; i++) {
8970 const Py_ssize_t old_sz = sz;
8971 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008972 if (!PyUnicode_Check(item)) {
8973 PyErr_Format(PyExc_TypeError,
8974 "sequence item %zd: expected str instance,"
8975 " %.80s found",
8976 i, Py_TYPE(item)->tp_name);
8977 goto onError;
8978 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008979 if (PyUnicode_READY(item) == -1)
8980 goto onError;
8981 sz += PyUnicode_GET_LENGTH(item);
8982 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8983 if (item_maxchar > maxchar)
8984 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008985 if (i != 0)
8986 sz += seplen;
8987 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8988 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008989 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008990 goto onError;
8991 }
8992 }
Tim Petersced69f82003-09-16 20:30:58 +00008993
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008994 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008995 if (res == NULL)
8996 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008997
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008998 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008999 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinner9ce5a832011-10-03 23:36:02 +02009000 Py_ssize_t itemlen, copied;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009001 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009002 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009003 if (i && seplen != 0) {
9004 copied = PyUnicode_CopyCharacters(res, res_offset,
9005 sep, 0, seplen);
9006 if (copied < 0)
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009007 goto onError;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009008#ifdef Py_DEBUG
9009 res_offset += copied;
9010#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 res_offset += seplen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009012#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00009013 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009014 itemlen = PyUnicode_GET_LENGTH(item);
9015 if (itemlen != 0) {
9016 copied = PyUnicode_CopyCharacters(res, res_offset,
9017 item, 0, itemlen);
9018 if (copied < 0)
9019 goto onError;
9020#ifdef Py_DEBUG
9021 res_offset += copied;
9022#else
9023 res_offset += itemlen;
9024#endif
9025 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009026 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009027 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009028
Benjamin Peterson29060642009-01-31 22:14:21 +00009029 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00009030 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031 Py_XDECREF(sep);
9032 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033
Benjamin Peterson29060642009-01-31 22:14:21 +00009034 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009035 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009036 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009037 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038 return NULL;
9039}
9040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009041#define FILL(kind, data, value, start, length) \
9042 do { \
9043 Py_ssize_t i_ = 0; \
9044 assert(kind != PyUnicode_WCHAR_KIND); \
9045 switch ((kind)) { \
9046 case PyUnicode_1BYTE_KIND: { \
9047 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9048 memset(to_, (unsigned char)value, length); \
9049 break; \
9050 } \
9051 case PyUnicode_2BYTE_KIND: { \
9052 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9053 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9054 break; \
9055 } \
9056 default: { \
9057 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9058 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9059 break; \
9060 } \
9061 } \
9062 } while (0)
9063
Alexander Belopolsky40018472011-02-26 01:02:56 +00009064static PyUnicodeObject *
9065pad(PyUnicodeObject *self,
9066 Py_ssize_t left,
9067 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070 PyObject *u;
9071 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009072 int kind;
9073 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009074
9075 if (left < 0)
9076 left = 0;
9077 if (right < 0)
9078 right = 0;
9079
Tim Peters7a29bd52001-09-12 03:03:31 +00009080 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081 Py_INCREF(self);
9082 return self;
9083 }
9084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009085 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9086 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009087 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9088 return NULL;
9089 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9091 if (fill > maxchar)
9092 maxchar = fill;
9093 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009094 if (!u)
9095 return NULL;
9096
9097 kind = PyUnicode_KIND(u);
9098 data = PyUnicode_DATA(u);
9099 if (left)
9100 FILL(kind, data, fill, 0, left);
9101 if (right)
9102 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02009103 if (PyUnicode_CopyCharacters(u, left,
9104 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009105 _PyUnicode_LENGTH(self)) < 0)
9106 {
9107 Py_DECREF(u);
9108 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109 }
9110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009111 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009112}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009113#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009114
Alexander Belopolsky40018472011-02-26 01:02:56 +00009115PyObject *
9116PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009117{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009118 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009119
9120 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009121 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009122 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009124 switch(PyUnicode_KIND(string)) {
9125 case PyUnicode_1BYTE_KIND:
9126 list = ucs1lib_splitlines(
9127 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9128 PyUnicode_GET_LENGTH(string), keepends);
9129 break;
9130 case PyUnicode_2BYTE_KIND:
9131 list = ucs2lib_splitlines(
9132 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9133 PyUnicode_GET_LENGTH(string), keepends);
9134 break;
9135 case PyUnicode_4BYTE_KIND:
9136 list = ucs4lib_splitlines(
9137 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9138 PyUnicode_GET_LENGTH(string), keepends);
9139 break;
9140 default:
9141 assert(0);
9142 list = 0;
9143 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009144 Py_DECREF(string);
9145 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009146}
9147
Alexander Belopolsky40018472011-02-26 01:02:56 +00009148static PyObject *
9149split(PyUnicodeObject *self,
9150 PyUnicodeObject *substring,
9151 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009153 int kind1, kind2, kind;
9154 void *buf1, *buf2;
9155 Py_ssize_t len1, len2;
9156 PyObject* out;
9157
Guido van Rossumd57fd912000-03-10 22:53:23 +00009158 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009159 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009161 if (PyUnicode_READY(self) == -1)
9162 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009164 if (substring == NULL)
9165 switch(PyUnicode_KIND(self)) {
9166 case PyUnicode_1BYTE_KIND:
9167 return ucs1lib_split_whitespace(
9168 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9169 PyUnicode_GET_LENGTH(self), maxcount
9170 );
9171 case PyUnicode_2BYTE_KIND:
9172 return ucs2lib_split_whitespace(
9173 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9174 PyUnicode_GET_LENGTH(self), maxcount
9175 );
9176 case PyUnicode_4BYTE_KIND:
9177 return ucs4lib_split_whitespace(
9178 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9179 PyUnicode_GET_LENGTH(self), maxcount
9180 );
9181 default:
9182 assert(0);
9183 return NULL;
9184 }
9185
9186 if (PyUnicode_READY(substring) == -1)
9187 return NULL;
9188
9189 kind1 = PyUnicode_KIND(self);
9190 kind2 = PyUnicode_KIND(substring);
9191 kind = kind1 > kind2 ? kind1 : kind2;
9192 buf1 = PyUnicode_DATA(self);
9193 buf2 = PyUnicode_DATA(substring);
9194 if (kind1 != kind)
9195 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9196 if (!buf1)
9197 return NULL;
9198 if (kind2 != kind)
9199 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9200 if (!buf2) {
9201 if (kind1 != kind) PyMem_Free(buf1);
9202 return NULL;
9203 }
9204 len1 = PyUnicode_GET_LENGTH(self);
9205 len2 = PyUnicode_GET_LENGTH(substring);
9206
9207 switch(kind) {
9208 case PyUnicode_1BYTE_KIND:
9209 out = ucs1lib_split(
9210 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9211 break;
9212 case PyUnicode_2BYTE_KIND:
9213 out = ucs2lib_split(
9214 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9215 break;
9216 case PyUnicode_4BYTE_KIND:
9217 out = ucs4lib_split(
9218 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9219 break;
9220 default:
9221 out = NULL;
9222 }
9223 if (kind1 != kind)
9224 PyMem_Free(buf1);
9225 if (kind2 != kind)
9226 PyMem_Free(buf2);
9227 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009228}
9229
Alexander Belopolsky40018472011-02-26 01:02:56 +00009230static PyObject *
9231rsplit(PyUnicodeObject *self,
9232 PyUnicodeObject *substring,
9233 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009234{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009235 int kind1, kind2, kind;
9236 void *buf1, *buf2;
9237 Py_ssize_t len1, len2;
9238 PyObject* out;
9239
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009240 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009241 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009243 if (PyUnicode_READY(self) == -1)
9244 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 if (substring == NULL)
9247 switch(PyUnicode_KIND(self)) {
9248 case PyUnicode_1BYTE_KIND:
9249 return ucs1lib_rsplit_whitespace(
9250 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9251 PyUnicode_GET_LENGTH(self), maxcount
9252 );
9253 case PyUnicode_2BYTE_KIND:
9254 return ucs2lib_rsplit_whitespace(
9255 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9256 PyUnicode_GET_LENGTH(self), maxcount
9257 );
9258 case PyUnicode_4BYTE_KIND:
9259 return ucs4lib_rsplit_whitespace(
9260 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9261 PyUnicode_GET_LENGTH(self), maxcount
9262 );
9263 default:
9264 assert(0);
9265 return NULL;
9266 }
9267
9268 if (PyUnicode_READY(substring) == -1)
9269 return NULL;
9270
9271 kind1 = PyUnicode_KIND(self);
9272 kind2 = PyUnicode_KIND(substring);
9273 kind = kind1 > kind2 ? kind1 : kind2;
9274 buf1 = PyUnicode_DATA(self);
9275 buf2 = PyUnicode_DATA(substring);
9276 if (kind1 != kind)
9277 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9278 if (!buf1)
9279 return NULL;
9280 if (kind2 != kind)
9281 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9282 if (!buf2) {
9283 if (kind1 != kind) PyMem_Free(buf1);
9284 return NULL;
9285 }
9286 len1 = PyUnicode_GET_LENGTH(self);
9287 len2 = PyUnicode_GET_LENGTH(substring);
9288
9289 switch(kind) {
9290 case PyUnicode_1BYTE_KIND:
9291 out = ucs1lib_rsplit(
9292 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9293 break;
9294 case PyUnicode_2BYTE_KIND:
9295 out = ucs2lib_rsplit(
9296 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9297 break;
9298 case PyUnicode_4BYTE_KIND:
9299 out = ucs4lib_rsplit(
9300 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9301 break;
9302 default:
9303 out = NULL;
9304 }
9305 if (kind1 != kind)
9306 PyMem_Free(buf1);
9307 if (kind2 != kind)
9308 PyMem_Free(buf2);
9309 return out;
9310}
9311
9312static Py_ssize_t
9313anylib_find(int kind, void *buf1, Py_ssize_t len1,
9314 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9315{
9316 switch(kind) {
9317 case PyUnicode_1BYTE_KIND:
9318 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9319 case PyUnicode_2BYTE_KIND:
9320 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9321 case PyUnicode_4BYTE_KIND:
9322 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9323 }
9324 assert(0);
9325 return -1;
9326}
9327
9328static Py_ssize_t
9329anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9330 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9331{
9332 switch(kind) {
9333 case PyUnicode_1BYTE_KIND:
9334 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9335 case PyUnicode_2BYTE_KIND:
9336 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9337 case PyUnicode_4BYTE_KIND:
9338 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9339 }
9340 assert(0);
9341 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009342}
9343
Alexander Belopolsky40018472011-02-26 01:02:56 +00009344static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009345replace(PyObject *self, PyObject *str1,
9346 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009347{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 PyObject *u;
9349 char *sbuf = PyUnicode_DATA(self);
9350 char *buf1 = PyUnicode_DATA(str1);
9351 char *buf2 = PyUnicode_DATA(str2);
9352 int srelease = 0, release1 = 0, release2 = 0;
9353 int skind = PyUnicode_KIND(self);
9354 int kind1 = PyUnicode_KIND(str1);
9355 int kind2 = PyUnicode_KIND(str2);
9356 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9357 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9358 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359
9360 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009361 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009362 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009363 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009364
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009365 if (skind < kind1)
9366 /* substring too wide to be present */
9367 goto nothing;
9368
9369 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009370 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009371 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009373 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009374 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009375 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009376 Py_UCS4 u1, u2, maxchar;
9377 int mayshrink, rkind;
9378 u1 = PyUnicode_READ_CHAR(str1, 0);
9379 if (!findchar(sbuf, PyUnicode_KIND(self),
9380 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009381 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009382 u2 = PyUnicode_READ_CHAR(str2, 0);
9383 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9384 /* Replacing u1 with u2 may cause a maxchar reduction in the
9385 result string. */
9386 mayshrink = maxchar > 127;
9387 if (u2 > maxchar) {
9388 maxchar = u2;
9389 mayshrink = 0;
9390 }
9391 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009392 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009393 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009394 if (PyUnicode_CopyCharacters(u, 0,
9395 (PyObject*)self, 0, slen) < 0)
9396 {
9397 Py_DECREF(u);
9398 return NULL;
9399 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009400 rkind = PyUnicode_KIND(u);
9401 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9402 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009403 if (--maxcount < 0)
9404 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009405 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009406 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009407 if (mayshrink) {
9408 PyObject *tmp = u;
9409 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9410 PyUnicode_GET_LENGTH(tmp));
9411 Py_DECREF(tmp);
9412 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009413 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 int rkind = skind;
9415 char *res;
9416 if (kind1 < rkind) {
9417 /* widen substring */
9418 buf1 = _PyUnicode_AsKind(str1, rkind);
9419 if (!buf1) goto error;
9420 release1 = 1;
9421 }
9422 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009423 if (i < 0)
9424 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 if (rkind > kind2) {
9426 /* widen replacement */
9427 buf2 = _PyUnicode_AsKind(str2, rkind);
9428 if (!buf2) goto error;
9429 release2 = 1;
9430 }
9431 else if (rkind < kind2) {
9432 /* widen self and buf1 */
9433 rkind = kind2;
9434 if (release1) PyMem_Free(buf1);
9435 sbuf = _PyUnicode_AsKind(self, rkind);
9436 if (!sbuf) goto error;
9437 srelease = 1;
9438 buf1 = _PyUnicode_AsKind(str1, rkind);
9439 if (!buf1) goto error;
9440 release1 = 1;
9441 }
9442 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9443 if (!res) {
9444 PyErr_NoMemory();
9445 goto error;
9446 }
9447 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009448 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9450 buf2,
9451 PyUnicode_KIND_SIZE(rkind, len2));
9452 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009453
9454 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9456 slen-i,
9457 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009458 if (i == -1)
9459 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9461 buf2,
9462 PyUnicode_KIND_SIZE(rkind, len2));
9463 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465
9466 u = PyUnicode_FromKindAndData(rkind, res, slen);
9467 PyMem_Free(res);
9468 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009469 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 Py_ssize_t n, i, j, ires;
9473 Py_ssize_t product, new_size;
9474 int rkind = skind;
9475 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 if (kind1 < rkind) {
9478 buf1 = _PyUnicode_AsKind(str1, rkind);
9479 if (!buf1) goto error;
9480 release1 = 1;
9481 }
9482 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009483 if (n == 0)
9484 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009485 if (kind2 < rkind) {
9486 buf2 = _PyUnicode_AsKind(str2, rkind);
9487 if (!buf2) goto error;
9488 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009489 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 else if (kind2 > rkind) {
9491 rkind = kind2;
9492 sbuf = _PyUnicode_AsKind(self, rkind);
9493 if (!sbuf) goto error;
9494 srelease = 1;
9495 if (release1) PyMem_Free(buf1);
9496 buf1 = _PyUnicode_AsKind(str1, rkind);
9497 if (!buf1) goto error;
9498 release1 = 1;
9499 }
9500 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9501 PyUnicode_GET_LENGTH(str1))); */
9502 product = n * (len2-len1);
9503 if ((product / (len2-len1)) != n) {
9504 PyErr_SetString(PyExc_OverflowError,
9505 "replace string is too long");
9506 goto error;
9507 }
9508 new_size = slen + product;
9509 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9510 PyErr_SetString(PyExc_OverflowError,
9511 "replace string is too long");
9512 goto error;
9513 }
9514 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9515 if (!res)
9516 goto error;
9517 ires = i = 0;
9518 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009519 while (n-- > 0) {
9520 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009521 j = anylib_find(rkind,
9522 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9523 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009524 if (j == -1)
9525 break;
9526 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009527 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009528 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9529 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9530 PyUnicode_KIND_SIZE(rkind, j-i));
9531 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009532 }
9533 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009534 if (len2 > 0) {
9535 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9536 buf2,
9537 PyUnicode_KIND_SIZE(rkind, len2));
9538 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009541 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009542 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009543 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009544 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9545 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9546 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009547 } else {
9548 /* interleave */
9549 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009550 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9551 buf2,
9552 PyUnicode_KIND_SIZE(rkind, len2));
9553 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009554 if (--n <= 0)
9555 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009556 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9557 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9558 PyUnicode_KIND_SIZE(rkind, 1));
9559 ires++;
9560 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009561 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9563 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9564 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009565 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009567 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009568 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009569 if (srelease)
9570 PyMem_FREE(sbuf);
9571 if (release1)
9572 PyMem_FREE(buf1);
9573 if (release2)
9574 PyMem_FREE(buf2);
9575 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009576
Benjamin Peterson29060642009-01-31 22:14:21 +00009577 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009578 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009579 if (srelease)
9580 PyMem_FREE(sbuf);
9581 if (release1)
9582 PyMem_FREE(buf1);
9583 if (release2)
9584 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009585 if (PyUnicode_CheckExact(self)) {
9586 Py_INCREF(self);
9587 return (PyObject *) self;
9588 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009589 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590 error:
9591 if (srelease && sbuf)
9592 PyMem_FREE(sbuf);
9593 if (release1 && buf1)
9594 PyMem_FREE(buf1);
9595 if (release2 && buf2)
9596 PyMem_FREE(buf2);
9597 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009598}
9599
9600/* --- Unicode Object Methods --------------------------------------------- */
9601
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009602PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009603 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009604\n\
9605Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009606characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009607
9608static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009609unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009610{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009611 return fixup(self, fixtitle);
9612}
9613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009614PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009615 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009616\n\
9617Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009618have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009619
9620static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009621unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009622{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623 return fixup(self, fixcapitalize);
9624}
9625
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out) implementation of a capwords() method:
   split self into a list, run fixup(..., fixcapitalize) over every list
   item in place, then join the list again.
   Returns the joined string, or NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot with the new object */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);
        if (new_word == NULL)
            goto done;          /* joined is still NULL here */
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, new_word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return joined;
}
#endif
9663
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009664/* Argument converter. Coerces to a single unicode character */
9665
9666static int
9667convert_uc(PyObject *obj, void *addr)
9668{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009669 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009670 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009671
Benjamin Peterson14339b62009-01-31 16:36:08 +00009672 uniobj = PyUnicode_FromObject(obj);
9673 if (uniobj == NULL) {
9674 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009675 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009676 return 0;
9677 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009679 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009680 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009681 Py_DECREF(uniobj);
9682 return 0;
9683 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009684 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009685 Py_DECREF(uniobj);
9686 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009687}
9688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009689PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009690 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009692Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009693done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694
9695static PyObject *
9696unicode_center(PyUnicodeObject *self, PyObject *args)
9697{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009698 Py_ssize_t marg, left;
9699 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009700 Py_UCS4 fillchar = ' ';
9701
Victor Stinnere9a29352011-10-01 02:14:59 +02009702 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009704
Victor Stinnere9a29352011-10-01 02:14:59 +02009705 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009706 return NULL;
9707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009708 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709 Py_INCREF(self);
9710 return (PyObject*) self;
9711 }
9712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009714 left = marg / 2 + (marg & width & 1);
9715
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009716 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009717}
9718
Marc-André Lemburge5034372000-08-08 08:04:29 +00009719#if 0
9720
9721/* This code should go into some future Unicode collation support
9722 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009723 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009724
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009725/* speedy UTF-16 code point order comparison */
9726/* gleaned from: */
9727/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9728
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009729static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009730{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009731 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009732 0, 0, 0, 0, 0, 0, 0, 0,
9733 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009734 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009735};
9736
Guido van Rossumd57fd912000-03-10 22:53:23 +00009737static int
9738unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9739{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009740 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009741
Guido van Rossumd57fd912000-03-10 22:53:23 +00009742 Py_UNICODE *s1 = str1->str;
9743 Py_UNICODE *s2 = str2->str;
9744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009745 len1 = str1->_base._base.length;
9746 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009747
Guido van Rossumd57fd912000-03-10 22:53:23 +00009748 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009749 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009750
9751 c1 = *s1++;
9752 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009753
Benjamin Peterson29060642009-01-31 22:14:21 +00009754 if (c1 > (1<<11) * 26)
9755 c1 += utf16Fixup[c1>>11];
9756 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009757 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009758 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009759
9760 if (c1 != c2)
9761 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009762
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009763 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009764 }
9765
9766 return (len1 < len2) ? -1 : (len1 != len2);
9767}
9768
Marc-André Lemburge5034372000-08-08 08:04:29 +00009769#else
9770
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009771/* This function assumes that str1 and str2 are readied by the caller. */
9772
Marc-André Lemburge5034372000-08-08 08:04:29 +00009773static int
9774unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9775{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009776 int kind1, kind2;
9777 void *data1, *data2;
9778 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 kind1 = PyUnicode_KIND(str1);
9781 kind2 = PyUnicode_KIND(str2);
9782 data1 = PyUnicode_DATA(str1);
9783 data2 = PyUnicode_DATA(str2);
9784 len1 = PyUnicode_GET_LENGTH(str1);
9785 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009787 for (i = 0; i < len1 && i < len2; ++i) {
9788 Py_UCS4 c1, c2;
9789 c1 = PyUnicode_READ(kind1, data1, i);
9790 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009791
9792 if (c1 != c2)
9793 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009794 }
9795
9796 return (len1 < len2) ? -1 : (len1 != len2);
9797}
9798
9799#endif
9800
Alexander Belopolsky40018472011-02-26 01:02:56 +00009801int
9802PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9805 if (PyUnicode_READY(left) == -1 ||
9806 PyUnicode_READY(right) == -1)
9807 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009808 return unicode_compare((PyUnicodeObject *)left,
9809 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009810 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009811 PyErr_Format(PyExc_TypeError,
9812 "Can't compare %.100s and %.100s",
9813 left->ob_type->tp_name,
9814 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009815 return -1;
9816}
9817
Martin v. Löwis5b222132007-06-10 09:51:05 +00009818int
9819PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9820{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009821 Py_ssize_t i;
9822 int kind;
9823 void *data;
9824 Py_UCS4 chr;
9825
Victor Stinner910337b2011-10-03 03:20:16 +02009826 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009827 if (PyUnicode_READY(uni) == -1)
9828 return -1;
9829 kind = PyUnicode_KIND(uni);
9830 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009831 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009832 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9833 if (chr != str[i])
9834 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009835 /* This check keeps Python strings that end in '\0' from comparing equal
9836 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009837 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009838 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009839 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009840 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009841 return 0;
9842}
9843
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009844
Benjamin Peterson29060642009-01-31 22:14:21 +00009845#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009846 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009847
Alexander Belopolsky40018472011-02-26 01:02:56 +00009848PyObject *
9849PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009850{
9851 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009852
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009853 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9854 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855 if (PyUnicode_READY(left) == -1 ||
9856 PyUnicode_READY(right) == -1)
9857 return NULL;
9858 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9859 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009860 if (op == Py_EQ) {
9861 Py_INCREF(Py_False);
9862 return Py_False;
9863 }
9864 if (op == Py_NE) {
9865 Py_INCREF(Py_True);
9866 return Py_True;
9867 }
9868 }
9869 if (left == right)
9870 result = 0;
9871 else
9872 result = unicode_compare((PyUnicodeObject *)left,
9873 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009874
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009875 /* Convert the return value to a Boolean */
9876 switch (op) {
9877 case Py_EQ:
9878 v = TEST_COND(result == 0);
9879 break;
9880 case Py_NE:
9881 v = TEST_COND(result != 0);
9882 break;
9883 case Py_LE:
9884 v = TEST_COND(result <= 0);
9885 break;
9886 case Py_GE:
9887 v = TEST_COND(result >= 0);
9888 break;
9889 case Py_LT:
9890 v = TEST_COND(result == -1);
9891 break;
9892 case Py_GT:
9893 v = TEST_COND(result == 1);
9894 break;
9895 default:
9896 PyErr_BadArgument();
9897 return NULL;
9898 }
9899 Py_INCREF(v);
9900 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009901 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009902
Brian Curtindfc80e32011-08-10 20:28:54 -05009903 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009904}
9905
Alexander Belopolsky40018472011-02-26 01:02:56 +00009906int
9907PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009908{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009909 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910 int kind1, kind2, kind;
9911 void *buf1, *buf2;
9912 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009913 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009914
9915 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009916 sub = PyUnicode_FromObject(element);
9917 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009918 PyErr_Format(PyExc_TypeError,
9919 "'in <string>' requires string as left operand, not %s",
9920 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009921 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009923 if (PyUnicode_READY(sub) == -1)
9924 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009925
Thomas Wouters477c8d52006-05-27 19:21:47 +00009926 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009927 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009928 Py_DECREF(sub);
9929 return -1;
9930 }
9931
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009932 kind1 = PyUnicode_KIND(str);
9933 kind2 = PyUnicode_KIND(sub);
9934 kind = kind1 > kind2 ? kind1 : kind2;
9935 buf1 = PyUnicode_DATA(str);
9936 buf2 = PyUnicode_DATA(sub);
9937 if (kind1 != kind)
9938 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9939 if (!buf1) {
9940 Py_DECREF(sub);
9941 return -1;
9942 }
9943 if (kind2 != kind)
9944 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9945 if (!buf2) {
9946 Py_DECREF(sub);
9947 if (kind1 != kind) PyMem_Free(buf1);
9948 return -1;
9949 }
9950 len1 = PyUnicode_GET_LENGTH(str);
9951 len2 = PyUnicode_GET_LENGTH(sub);
9952
9953 switch(kind) {
9954 case PyUnicode_1BYTE_KIND:
9955 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9956 break;
9957 case PyUnicode_2BYTE_KIND:
9958 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9959 break;
9960 case PyUnicode_4BYTE_KIND:
9961 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9962 break;
9963 default:
9964 result = -1;
9965 assert(0);
9966 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009967
9968 Py_DECREF(str);
9969 Py_DECREF(sub);
9970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009971 if (kind1 != kind)
9972 PyMem_Free(buf1);
9973 if (kind2 != kind)
9974 PyMem_Free(buf2);
9975
Guido van Rossum403d68b2000-03-13 15:55:09 +00009976 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009977}
9978
Guido van Rossumd57fd912000-03-10 22:53:23 +00009979/* Concat to string or Unicode object giving a new Unicode object. */
9980
Alexander Belopolsky40018472011-02-26 01:02:56 +00009981PyObject *
9982PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009983{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009984 PyObject *u = NULL, *v = NULL, *w;
9985 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009986
9987 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009989 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009990 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009991 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009992 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009993 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009994
9995 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009996 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009997 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009999 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010000 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010001 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010002 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010003 }
10004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010005 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +020010006 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010007
Guido van Rossumd57fd912000-03-10 22:53:23 +000010008 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 w = PyUnicode_New(
10010 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10011 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010012 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010013 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010014 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
10015 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +020010016 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010017 v, 0,
10018 PyUnicode_GET_LENGTH(v)) < 0)
10019 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010020 Py_DECREF(u);
10021 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010023
Benjamin Peterson29060642009-01-31 22:14:21 +000010024 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010025 Py_XDECREF(u);
10026 Py_XDECREF(v);
10027 return NULL;
10028}
10029
Victor Stinnerb0923652011-10-04 01:17:31 +020010030static void
10031unicode_append_inplace(PyObject **p_left, PyObject *right)
10032{
10033 Py_ssize_t left_len, right_len, new_len;
10034#ifdef Py_DEBUG
10035 Py_ssize_t copied;
10036#endif
10037
10038 assert(PyUnicode_IS_READY(*p_left));
10039 assert(PyUnicode_IS_READY(right));
10040
10041 left_len = PyUnicode_GET_LENGTH(*p_left);
10042 right_len = PyUnicode_GET_LENGTH(right);
10043 if (left_len > PY_SSIZE_T_MAX - right_len) {
10044 PyErr_SetString(PyExc_OverflowError,
10045 "strings are too large to concat");
10046 goto error;
10047 }
10048 new_len = left_len + right_len;
10049
10050 /* Now we own the last reference to 'left', so we can resize it
10051 * in-place.
10052 */
10053 if (unicode_resize(p_left, new_len) != 0) {
10054 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10055 * deallocated so it cannot be put back into
10056 * 'variable'. The MemoryError is raised when there
10057 * is no value in 'variable', which might (very
10058 * remotely) be a cause of incompatibilities.
10059 */
10060 goto error;
10061 }
10062 /* copy 'right' into the newly allocated area of 'left' */
10063#ifdef Py_DEBUG
10064 copied = PyUnicode_CopyCharacters(*p_left, left_len,
10065 right, 0,
10066 right_len);
10067 assert(0 <= copied);
10068#else
10069 PyUnicode_CopyCharacters(*p_left, left_len, right, 0, right_len);
10070#endif
10071 return;
10072
10073error:
10074 Py_DECREF(*p_left);
10075 *p_left = NULL;
10076}
10077
Walter Dörwald1ab83302007-05-18 17:15:44 +000010078void
Victor Stinner23e56682011-10-03 03:54:37 +020010079PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010080{
Victor Stinner23e56682011-10-03 03:54:37 +020010081 PyObject *left, *res;
10082
10083 if (p_left == NULL) {
10084 if (!PyErr_Occurred())
10085 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010086 return;
10087 }
Victor Stinner23e56682011-10-03 03:54:37 +020010088 left = *p_left;
10089 if (right == NULL || !PyUnicode_Check(left)) {
10090 if (!PyErr_Occurred())
10091 PyErr_BadInternalCall();
10092 goto error;
10093 }
10094
Victor Stinnere1335c72011-10-04 20:53:03 +020010095 if (PyUnicode_READY(left))
10096 goto error;
10097 if (PyUnicode_READY(right))
10098 goto error;
10099
Victor Stinner23e56682011-10-03 03:54:37 +020010100 if (PyUnicode_CheckExact(left) && left != unicode_empty
10101 && PyUnicode_CheckExact(right) && right != unicode_empty
10102 && unicode_resizable(left)
10103 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10104 || _PyUnicode_WSTR(left) != NULL))
10105 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010106 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10107 to change the structure size, but characters are stored just after
10108 the structure, and so it requires to move all charactres which is
10109 not so different than duplicating the string. */
10110 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010111 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010112 unicode_append_inplace(p_left, right);
Victor Stinner23e56682011-10-03 03:54:37 +020010113 return;
10114 }
10115 }
10116
10117 res = PyUnicode_Concat(left, right);
10118 if (res == NULL)
10119 goto error;
10120 Py_DECREF(left);
10121 *p_left = res;
10122 return;
10123
10124error:
10125 Py_DECREF(*p_left);
10126 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010127}
10128
10129void
10130PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10131{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010132 PyUnicode_Append(pleft, right);
10133 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010134}
10135
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010136PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010137 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010138\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010139Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010140string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010141interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142
10143static PyObject *
10144unicode_count(PyUnicodeObject *self, PyObject *args)
10145{
10146 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010147 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010148 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 int kind1, kind2, kind;
10151 void *buf1, *buf2;
10152 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010153
Jesus Ceaac451502011-04-20 17:09:23 +020010154 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10155 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010156 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 kind1 = PyUnicode_KIND(self);
10159 kind2 = PyUnicode_KIND(substring);
10160 kind = kind1 > kind2 ? kind1 : kind2;
10161 buf1 = PyUnicode_DATA(self);
10162 buf2 = PyUnicode_DATA(substring);
10163 if (kind1 != kind)
10164 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10165 if (!buf1) {
10166 Py_DECREF(substring);
10167 return NULL;
10168 }
10169 if (kind2 != kind)
10170 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10171 if (!buf2) {
10172 Py_DECREF(substring);
10173 if (kind1 != kind) PyMem_Free(buf1);
10174 return NULL;
10175 }
10176 len1 = PyUnicode_GET_LENGTH(self);
10177 len2 = PyUnicode_GET_LENGTH(substring);
10178
10179 ADJUST_INDICES(start, end, len1);
10180 switch(kind) {
10181 case PyUnicode_1BYTE_KIND:
10182 iresult = ucs1lib_count(
10183 ((Py_UCS1*)buf1) + start, end - start,
10184 buf2, len2, PY_SSIZE_T_MAX
10185 );
10186 break;
10187 case PyUnicode_2BYTE_KIND:
10188 iresult = ucs2lib_count(
10189 ((Py_UCS2*)buf1) + start, end - start,
10190 buf2, len2, PY_SSIZE_T_MAX
10191 );
10192 break;
10193 case PyUnicode_4BYTE_KIND:
10194 iresult = ucs4lib_count(
10195 ((Py_UCS4*)buf1) + start, end - start,
10196 buf2, len2, PY_SSIZE_T_MAX
10197 );
10198 break;
10199 default:
10200 assert(0); iresult = 0;
10201 }
10202
10203 result = PyLong_FromSsize_t(iresult);
10204
10205 if (kind1 != kind)
10206 PyMem_Free(buf1);
10207 if (kind2 != kind)
10208 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010209
10210 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010211
Guido van Rossumd57fd912000-03-10 22:53:23 +000010212 return result;
10213}
10214
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010215PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010216 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010217\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010218Encode S using the codec registered for encoding. Default encoding\n\
10219is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010220handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010221a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10222'xmlcharrefreplace' as well as any other name registered with\n\
10223codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224
10225static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010226unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010227{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010228 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229 char *encoding = NULL;
10230 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010231
Benjamin Peterson308d6372009-09-18 21:42:35 +000010232 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10233 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010235 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010236}
10237
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010238PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010239 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010240\n\
10241Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010242If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010243
10244static PyObject*
10245unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10246{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010247 Py_ssize_t i, j, line_pos, src_len, incr;
10248 Py_UCS4 ch;
10249 PyObject *u;
10250 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010251 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010252 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010253 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010254
10255 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010256 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010257
Antoine Pitrou22425222011-10-04 19:10:51 +020010258 if (PyUnicode_READY(self) == -1)
10259 return NULL;
10260
Thomas Wouters7e474022000-07-16 12:04:32 +000010261 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010262 src_len = PyUnicode_GET_LENGTH(self);
10263 i = j = line_pos = 0;
10264 kind = PyUnicode_KIND(self);
10265 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010266 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010267 for (; i < src_len; i++) {
10268 ch = PyUnicode_READ(kind, src_data, i);
10269 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010270 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010271 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010272 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010273 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010274 goto overflow;
10275 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010276 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010277 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010278 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010279 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010280 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010281 goto overflow;
10282 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010283 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010284 if (ch == '\n' || ch == '\r')
10285 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010286 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010287 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010288 if (!found && PyUnicode_CheckExact(self)) {
10289 Py_INCREF((PyObject *) self);
10290 return (PyObject *) self;
10291 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010292
Guido van Rossumd57fd912000-03-10 22:53:23 +000010293 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010294 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010295 if (!u)
10296 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010297 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010298
Antoine Pitroue71d5742011-10-04 15:55:09 +020010299 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010300
Antoine Pitroue71d5742011-10-04 15:55:09 +020010301 for (; i < src_len; i++) {
10302 ch = PyUnicode_READ(kind, src_data, i);
10303 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010304 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010305 incr = tabsize - (line_pos % tabsize);
10306 line_pos += incr;
10307 while (incr--) {
10308 PyUnicode_WRITE(kind, dest_data, j, ' ');
10309 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010310 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010311 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010312 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010313 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010314 line_pos++;
10315 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010316 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010317 if (ch == '\n' || ch == '\r')
10318 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010319 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010320 }
10321 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010322#ifndef DONT_MAKE_RESULT_READY
10323 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 Py_DECREF(u);
10325 return NULL;
10326 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010327#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +000010328 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010329
Antoine Pitroue71d5742011-10-04 15:55:09 +020010330 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010331 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10332 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010333}
10334
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010335PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010336 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010337\n\
10338Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010339such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010340arguments start and end are interpreted as in slice notation.\n\
10341\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010342Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010343
10344static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010346{
Jesus Ceaac451502011-04-20 17:09:23 +020010347 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010348 Py_ssize_t start;
10349 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010350 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351
Jesus Ceaac451502011-04-20 17:09:23 +020010352 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10353 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010354 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 if (PyUnicode_READY(self) == -1)
10357 return NULL;
10358 if (PyUnicode_READY(substring) == -1)
10359 return NULL;
10360
10361 result = any_find_slice(
10362 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10363 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010364 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365
10366 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010367
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 if (result == -2)
10369 return NULL;
10370
Christian Heimes217cfd12007-12-02 14:31:20 +000010371 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010372}
10373
10374static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010375unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010377 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10378 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381}
10382
Guido van Rossumc2504932007-09-18 19:42:40 +000010383/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010384 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010385static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010386unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010387{
Guido van Rossumc2504932007-09-18 19:42:40 +000010388 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010389 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391 if (_PyUnicode_HASH(self) != -1)
10392 return _PyUnicode_HASH(self);
10393 if (PyUnicode_READY(self) == -1)
10394 return -1;
10395 len = PyUnicode_GET_LENGTH(self);
10396
10397 /* The hash function as a macro, gets expanded three times below. */
10398#define HASH(P) \
10399 x = (Py_uhash_t)*P << 7; \
10400 while (--len >= 0) \
10401 x = (1000003*x) ^ (Py_uhash_t)*P++;
10402
10403 switch (PyUnicode_KIND(self)) {
10404 case PyUnicode_1BYTE_KIND: {
10405 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10406 HASH(c);
10407 break;
10408 }
10409 case PyUnicode_2BYTE_KIND: {
10410 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10411 HASH(s);
10412 break;
10413 }
10414 default: {
10415 Py_UCS4 *l;
10416 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10417 "Impossible switch case in unicode_hash");
10418 l = PyUnicode_4BYTE_DATA(self);
10419 HASH(l);
10420 break;
10421 }
10422 }
10423 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10424
Guido van Rossumc2504932007-09-18 19:42:40 +000010425 if (x == -1)
10426 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010428 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010429}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010431
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010432PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010433 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010434\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010435Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010436
10437static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010438unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010439{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010440 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010441 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010442 Py_ssize_t start;
10443 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010444
Jesus Ceaac451502011-04-20 17:09:23 +020010445 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10446 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010447 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 if (PyUnicode_READY(self) == -1)
10450 return NULL;
10451 if (PyUnicode_READY(substring) == -1)
10452 return NULL;
10453
10454 result = any_find_slice(
10455 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10456 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010457 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010458
10459 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461 if (result == -2)
10462 return NULL;
10463
Guido van Rossumd57fd912000-03-10 22:53:23 +000010464 if (result < 0) {
10465 PyErr_SetString(PyExc_ValueError, "substring not found");
10466 return NULL;
10467 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010468
Christian Heimes217cfd12007-12-02 14:31:20 +000010469 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010470}
10471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010472PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010473 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010474\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010475Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010476at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010477
10478static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010479unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010480{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481 Py_ssize_t i, length;
10482 int kind;
10483 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010484 int cased;
10485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 if (PyUnicode_READY(self) == -1)
10487 return NULL;
10488 length = PyUnicode_GET_LENGTH(self);
10489 kind = PyUnicode_KIND(self);
10490 data = PyUnicode_DATA(self);
10491
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 if (length == 1)
10494 return PyBool_FromLong(
10495 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010496
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010497 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010499 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010500
Guido van Rossumd57fd912000-03-10 22:53:23 +000010501 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502 for (i = 0; i < length; i++) {
10503 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010504
Benjamin Peterson29060642009-01-31 22:14:21 +000010505 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10506 return PyBool_FromLong(0);
10507 else if (!cased && Py_UNICODE_ISLOWER(ch))
10508 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010509 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010510 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010511}
10512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010513PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010514 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010515\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010516Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010517at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010518
10519static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010520unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010521{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 Py_ssize_t i, length;
10523 int kind;
10524 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525 int cased;
10526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 if (PyUnicode_READY(self) == -1)
10528 return NULL;
10529 length = PyUnicode_GET_LENGTH(self);
10530 kind = PyUnicode_KIND(self);
10531 data = PyUnicode_DATA(self);
10532
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 if (length == 1)
10535 return PyBool_FromLong(
10536 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010537
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010538 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010540 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010541
Guido van Rossumd57fd912000-03-10 22:53:23 +000010542 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 for (i = 0; i < length; i++) {
10544 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010545
Benjamin Peterson29060642009-01-31 22:14:21 +000010546 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10547 return PyBool_FromLong(0);
10548 else if (!cased && Py_UNICODE_ISUPPER(ch))
10549 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010550 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010551 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010552}
10553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010554PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010555 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010556\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010557Return True if S is a titlecased string and there is at least one\n\
10558character in S, i.e. upper- and titlecase characters may only\n\
10559follow uncased characters and lowercase characters only cased ones.\n\
10560Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010561
10562static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010563unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010564{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 Py_ssize_t i, length;
10566 int kind;
10567 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010568 int cased, previous_is_cased;
10569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 if (PyUnicode_READY(self) == -1)
10571 return NULL;
10572 length = PyUnicode_GET_LENGTH(self);
10573 kind = PyUnicode_KIND(self);
10574 data = PyUnicode_DATA(self);
10575
Guido van Rossumd57fd912000-03-10 22:53:23 +000010576 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 if (length == 1) {
10578 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10579 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10580 (Py_UNICODE_ISUPPER(ch) != 0));
10581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010582
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010583 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010585 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010586
Guido van Rossumd57fd912000-03-10 22:53:23 +000010587 cased = 0;
10588 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 for (i = 0; i < length; i++) {
10590 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010591
Benjamin Peterson29060642009-01-31 22:14:21 +000010592 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10593 if (previous_is_cased)
10594 return PyBool_FromLong(0);
10595 previous_is_cased = 1;
10596 cased = 1;
10597 }
10598 else if (Py_UNICODE_ISLOWER(ch)) {
10599 if (!previous_is_cased)
10600 return PyBool_FromLong(0);
10601 previous_is_cased = 1;
10602 cased = 1;
10603 }
10604 else
10605 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010607 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010608}
10609
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010610PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010611 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010613Return True if all characters in S are whitespace\n\
10614and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615
10616static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010617unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 Py_ssize_t i, length;
10620 int kind;
10621 void *data;
10622
10623 if (PyUnicode_READY(self) == -1)
10624 return NULL;
10625 length = PyUnicode_GET_LENGTH(self);
10626 kind = PyUnicode_KIND(self);
10627 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628
Guido van Rossumd57fd912000-03-10 22:53:23 +000010629 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 if (length == 1)
10631 return PyBool_FromLong(
10632 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010633
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010634 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010636 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 for (i = 0; i < length; i++) {
10639 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010640 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010641 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010643 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010644}
10645
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010646PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010647 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010648\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010649Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010650and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010651
10652static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010653unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010654{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 Py_ssize_t i, length;
10656 int kind;
10657 void *data;
10658
10659 if (PyUnicode_READY(self) == -1)
10660 return NULL;
10661 length = PyUnicode_GET_LENGTH(self);
10662 kind = PyUnicode_KIND(self);
10663 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010664
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010665 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010666 if (length == 1)
10667 return PyBool_FromLong(
10668 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010669
10670 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010672 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 for (i = 0; i < length; i++) {
10675 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010676 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010677 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010678 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010679}
10680
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010681PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010682 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010683\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010684Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010685and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010686
10687static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010688unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010689{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 int kind;
10691 void *data;
10692 Py_ssize_t len, i;
10693
10694 if (PyUnicode_READY(self) == -1)
10695 return NULL;
10696
10697 kind = PyUnicode_KIND(self);
10698 data = PyUnicode_DATA(self);
10699 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010700
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010701 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010702 if (len == 1) {
10703 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10704 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10705 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010706
10707 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010708 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010709 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010711 for (i = 0; i < len; i++) {
10712 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010713 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010714 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010715 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010716 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010717}
10718
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010719PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010720 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010721\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010722Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010723False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010724
10725static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010726unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 Py_ssize_t i, length;
10729 int kind;
10730 void *data;
10731
10732 if (PyUnicode_READY(self) == -1)
10733 return NULL;
10734 length = PyUnicode_GET_LENGTH(self);
10735 kind = PyUnicode_KIND(self);
10736 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737
Guido van Rossumd57fd912000-03-10 22:53:23 +000010738 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 if (length == 1)
10740 return PyBool_FromLong(
10741 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010743 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010745 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010747 for (i = 0; i < length; i++) {
10748 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010749 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010750 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010751 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752}
10753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010754PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010755 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010756\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010757Return True if all characters in S are digits\n\
10758and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759
10760static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010761unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010763 Py_ssize_t i, length;
10764 int kind;
10765 void *data;
10766
10767 if (PyUnicode_READY(self) == -1)
10768 return NULL;
10769 length = PyUnicode_GET_LENGTH(self);
10770 kind = PyUnicode_KIND(self);
10771 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772
Guido van Rossumd57fd912000-03-10 22:53:23 +000010773 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 if (length == 1) {
10775 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10776 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10777 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010778
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010779 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010780 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010781 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 for (i = 0; i < length; i++) {
10784 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010785 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010787 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010788}
10789
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010790PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010791 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010792\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010793Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010794False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010795
10796static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010797unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010798{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 Py_ssize_t i, length;
10800 int kind;
10801 void *data;
10802
10803 if (PyUnicode_READY(self) == -1)
10804 return NULL;
10805 length = PyUnicode_GET_LENGTH(self);
10806 kind = PyUnicode_KIND(self);
10807 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010808
Guido van Rossumd57fd912000-03-10 22:53:23 +000010809 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 if (length == 1)
10811 return PyBool_FromLong(
10812 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010814 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010816 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010818 for (i = 0; i < length; i++) {
10819 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010820 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010822 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823}
10824
Martin v. Löwis47383402007-08-15 07:32:56 +000010825int
10826PyUnicode_IsIdentifier(PyObject *self)
10827{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010828 int kind;
10829 void *data;
10830 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010831 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010833 if (PyUnicode_READY(self) == -1) {
10834 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010835 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010836 }
10837
10838 /* Special case for empty strings */
10839 if (PyUnicode_GET_LENGTH(self) == 0)
10840 return 0;
10841 kind = PyUnicode_KIND(self);
10842 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010843
10844 /* PEP 3131 says that the first character must be in
10845 XID_Start and subsequent characters in XID_Continue,
10846 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010847 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010848 letters, digits, underscore). However, given the current
10849 definition of XID_Start and XID_Continue, it is sufficient
10850 to check just for these, except that _ must be allowed
10851 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010852 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010853 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010854 return 0;
10855
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010856 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010857 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010858 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010859 return 1;
10860}
10861
10862PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010863 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010864\n\
10865Return True if S is a valid identifier according\n\
10866to the language definition.");
10867
10868static PyObject*
10869unicode_isidentifier(PyObject *self)
10870{
10871 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10872}
10873
Georg Brandl559e5d72008-06-11 18:37:52 +000010874PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010875 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010876\n\
10877Return True if all characters in S are considered\n\
10878printable in repr() or S is empty, False otherwise.");
10879
10880static PyObject*
10881unicode_isprintable(PyObject *self)
10882{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010883 Py_ssize_t i, length;
10884 int kind;
10885 void *data;
10886
10887 if (PyUnicode_READY(self) == -1)
10888 return NULL;
10889 length = PyUnicode_GET_LENGTH(self);
10890 kind = PyUnicode_KIND(self);
10891 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010892
10893 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 if (length == 1)
10895 return PyBool_FromLong(
10896 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010898 for (i = 0; i < length; i++) {
10899 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010900 Py_RETURN_FALSE;
10901 }
10902 }
10903 Py_RETURN_TRUE;
10904}
10905
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010906PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010907 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010908\n\
10909Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010910iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911
10912static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010913unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010914{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010915 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010916}
10917
Martin v. Löwis18e16552006-02-15 17:27:45 +000010918static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010919unicode_length(PyUnicodeObject *self)
10920{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010921 if (PyUnicode_READY(self) == -1)
10922 return -1;
10923 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924}
10925
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010926PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010927 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010929Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010930done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931
10932static PyObject *
10933unicode_ljust(PyUnicodeObject *self, PyObject *args)
10934{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010935 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936 Py_UCS4 fillchar = ' ';
10937
10938 if (PyUnicode_READY(self) == -1)
10939 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010940
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010941 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942 return NULL;
10943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945 Py_INCREF(self);
10946 return (PyObject*) self;
10947 }
10948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950}
10951
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010952PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010953 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010955Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956
10957static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010958unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960 return fixup(self, fixlower);
10961}
10962
/* Strip modes accepted by the strip helpers below */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip mode above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its leading
   three characters ("|O:") */
#define STRIPNAME(i) (stripformat[i]+3)
10971
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010972/* externally visible for str.strip(unicode) */
10973PyObject *
10974_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10975{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 void *data;
10977 int kind;
10978 Py_ssize_t i, j, len;
10979 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010981 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10982 return NULL;
10983
10984 kind = PyUnicode_KIND(self);
10985 data = PyUnicode_DATA(self);
10986 len = PyUnicode_GET_LENGTH(self);
10987 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10988 PyUnicode_DATA(sepobj),
10989 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010990
Benjamin Peterson14339b62009-01-31 16:36:08 +000010991 i = 0;
10992 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010993 while (i < len &&
10994 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010995 i++;
10996 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010997 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010998
Benjamin Peterson14339b62009-01-31 16:36:08 +000010999 j = len;
11000 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011001 do {
11002 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 } while (j >= i &&
11004 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011005 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011006 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011007
Victor Stinner12bab6d2011-10-01 01:53:49 +020011008 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009}
11010
11011PyObject*
11012PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11013{
11014 unsigned char *data;
11015 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011016 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017
Victor Stinnerde636f32011-10-01 03:55:54 +020011018 if (PyUnicode_READY(self) == -1)
11019 return NULL;
11020
11021 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11022
Victor Stinner12bab6d2011-10-01 01:53:49 +020011023 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011024 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011025 if (PyUnicode_CheckExact(self)) {
11026 Py_INCREF(self);
11027 return self;
11028 }
11029 else
11030 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011031 }
11032
Victor Stinner12bab6d2011-10-01 01:53:49 +020011033 length = end - start;
11034 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011035 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011036
Victor Stinnerde636f32011-10-01 03:55:54 +020011037 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011038 PyErr_SetString(PyExc_IndexError, "string index out of range");
11039 return NULL;
11040 }
11041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011042 kind = PyUnicode_KIND(self);
11043 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020011044 return PyUnicode_FromKindAndData(kind,
11045 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020011046 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048
11049static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011050do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011052 int kind;
11053 void *data;
11054 Py_ssize_t len, i, j;
11055
11056 if (PyUnicode_READY(self) == -1)
11057 return NULL;
11058
11059 kind = PyUnicode_KIND(self);
11060 data = PyUnicode_DATA(self);
11061 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011062
Benjamin Peterson14339b62009-01-31 16:36:08 +000011063 i = 0;
11064 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011066 i++;
11067 }
11068 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011069
Benjamin Peterson14339b62009-01-31 16:36:08 +000011070 j = len;
11071 if (striptype != LEFTSTRIP) {
11072 do {
11073 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011074 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011075 j++;
11076 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011077
Victor Stinner12bab6d2011-10-01 01:53:49 +020011078 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011079}
11080
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011081
11082static PyObject *
11083do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11084{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011085 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011086
Benjamin Peterson14339b62009-01-31 16:36:08 +000011087 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11088 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011089
Benjamin Peterson14339b62009-01-31 16:36:08 +000011090 if (sep != NULL && sep != Py_None) {
11091 if (PyUnicode_Check(sep))
11092 return _PyUnicode_XStrip(self, striptype, sep);
11093 else {
11094 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011095 "%s arg must be None or str",
11096 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011097 return NULL;
11098 }
11099 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011100
Benjamin Peterson14339b62009-01-31 16:36:08 +000011101 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011102}
11103
11104
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011105PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011106 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011107\n\
11108Return a copy of the string S with leading and trailing\n\
11109whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011110If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011111
11112static PyObject *
11113unicode_strip(PyUnicodeObject *self, PyObject *args)
11114{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011115 if (PyTuple_GET_SIZE(args) == 0)
11116 return do_strip(self, BOTHSTRIP); /* Common case */
11117 else
11118 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011119}
11120
11121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011122PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011123 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011124\n\
11125Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011126If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011127
11128static PyObject *
11129unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11130{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011131 if (PyTuple_GET_SIZE(args) == 0)
11132 return do_strip(self, LEFTSTRIP); /* Common case */
11133 else
11134 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011135}
11136
11137
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011138PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011139 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011140\n\
11141Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011142If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011143
11144static PyObject *
11145unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11146{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011147 if (PyTuple_GET_SIZE(args) == 0)
11148 return do_strip(self, RIGHTSTRIP); /* Common case */
11149 else
11150 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011151}
11152
11153
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011155unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011156{
11157 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011158 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159
Georg Brandl222de0f2009-04-12 12:01:50 +000011160 if (len < 1) {
11161 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011162 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011163 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011164
Tim Peters7a29bd52001-09-12 03:03:31 +000011165 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011166 /* no repeat, return original string */
11167 Py_INCREF(str);
11168 return (PyObject*) str;
11169 }
Tim Peters8f422462000-09-09 06:13:41 +000011170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011171 if (PyUnicode_READY(str) == -1)
11172 return NULL;
11173
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011174 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011175 PyErr_SetString(PyExc_OverflowError,
11176 "repeated string is too long");
11177 return NULL;
11178 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011181 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182 if (!u)
11183 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011184 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011186 if (PyUnicode_GET_LENGTH(str) == 1) {
11187 const int kind = PyUnicode_KIND(str);
11188 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11189 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011190 if (kind == PyUnicode_1BYTE_KIND)
11191 memset(to, (unsigned char)fill_char, len);
11192 else {
11193 for (n = 0; n < len; ++n)
11194 PyUnicode_WRITE(kind, to, n, fill_char);
11195 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 }
11197 else {
11198 /* number of characters copied this far */
11199 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11200 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11201 char *to = (char *) PyUnicode_DATA(u);
11202 Py_MEMCPY(to, PyUnicode_DATA(str),
11203 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011204 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205 n = (done <= nchars-done) ? done : nchars-done;
11206 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011207 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209 }
11210
11211 return (PyObject*) u;
11212}
11213
Alexander Belopolsky40018472011-02-26 01:02:56 +000011214PyObject *
11215PyUnicode_Replace(PyObject *obj,
11216 PyObject *subobj,
11217 PyObject *replobj,
11218 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219{
11220 PyObject *self;
11221 PyObject *str1;
11222 PyObject *str2;
11223 PyObject *result;
11224
11225 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011226 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011227 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011229 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011230 Py_DECREF(self);
11231 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232 }
11233 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011234 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011235 Py_DECREF(self);
11236 Py_DECREF(str1);
11237 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011239 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240 Py_DECREF(self);
11241 Py_DECREF(str1);
11242 Py_DECREF(str2);
11243 return result;
11244}
11245
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011246PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011247 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248\n\
11249Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011250old replaced by new. If the optional argument count is\n\
11251given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252
11253static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011254unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011256 PyObject *str1;
11257 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011258 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259 PyObject *result;
11260
Martin v. Löwis18e16552006-02-15 17:27:45 +000011261 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011263 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011264 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265 str1 = PyUnicode_FromObject(str1);
11266 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11267 return NULL;
11268 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011269 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011270 Py_DECREF(str1);
11271 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011272 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273
11274 result = replace(self, str1, str2, maxcount);
11275
11276 Py_DECREF(str1);
11277 Py_DECREF(str2);
11278 return result;
11279}
11280
Alexander Belopolsky40018472011-02-26 01:02:56 +000011281static PyObject *
11282unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011284 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011285 Py_ssize_t isize;
11286 Py_ssize_t osize, squote, dquote, i, o;
11287 Py_UCS4 max, quote;
11288 int ikind, okind;
11289 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011291 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011292 return NULL;
11293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011294 isize = PyUnicode_GET_LENGTH(unicode);
11295 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011297 /* Compute length of output, quote characters, and
11298 maximum character */
11299 osize = 2; /* quotes */
11300 max = 127;
11301 squote = dquote = 0;
11302 ikind = PyUnicode_KIND(unicode);
11303 for (i = 0; i < isize; i++) {
11304 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11305 switch (ch) {
11306 case '\'': squote++; osize++; break;
11307 case '"': dquote++; osize++; break;
11308 case '\\': case '\t': case '\r': case '\n':
11309 osize += 2; break;
11310 default:
11311 /* Fast-path ASCII */
11312 if (ch < ' ' || ch == 0x7f)
11313 osize += 4; /* \xHH */
11314 else if (ch < 0x7f)
11315 osize++;
11316 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11317 osize++;
11318 max = ch > max ? ch : max;
11319 }
11320 else if (ch < 0x100)
11321 osize += 4; /* \xHH */
11322 else if (ch < 0x10000)
11323 osize += 6; /* \uHHHH */
11324 else
11325 osize += 10; /* \uHHHHHHHH */
11326 }
11327 }
11328
11329 quote = '\'';
11330 if (squote) {
11331 if (dquote)
11332 /* Both squote and dquote present. Use squote,
11333 and escape them */
11334 osize += squote;
11335 else
11336 quote = '"';
11337 }
11338
11339 repr = PyUnicode_New(osize, max);
11340 if (repr == NULL)
11341 return NULL;
11342 okind = PyUnicode_KIND(repr);
11343 odata = PyUnicode_DATA(repr);
11344
11345 PyUnicode_WRITE(okind, odata, 0, quote);
11346 PyUnicode_WRITE(okind, odata, osize-1, quote);
11347
11348 for (i = 0, o = 1; i < isize; i++) {
11349 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011350
11351 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011352 if ((ch == quote) || (ch == '\\')) {
11353 PyUnicode_WRITE(okind, odata, o++, '\\');
11354 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011355 continue;
11356 }
11357
Benjamin Peterson29060642009-01-31 22:14:21 +000011358 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011359 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 PyUnicode_WRITE(okind, odata, o++, '\\');
11361 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011362 }
11363 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011364 PyUnicode_WRITE(okind, odata, o++, '\\');
11365 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011366 }
11367 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011368 PyUnicode_WRITE(okind, odata, o++, '\\');
11369 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011370 }
11371
11372 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011373 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 PyUnicode_WRITE(okind, odata, o++, '\\');
11375 PyUnicode_WRITE(okind, odata, o++, 'x');
11376 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11377 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011378 }
11379
Georg Brandl559e5d72008-06-11 18:37:52 +000011380 /* Copy ASCII characters as-is */
11381 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011383 }
11384
Benjamin Peterson29060642009-01-31 22:14:21 +000011385 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011386 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011387 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011388 (categories Z* and C* except ASCII space)
11389 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011390 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011391 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011392 if (ch <= 0xff) {
11393 PyUnicode_WRITE(okind, odata, o++, '\\');
11394 PyUnicode_WRITE(okind, odata, o++, 'x');
11395 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11396 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011397 }
11398 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011399 else if (ch >= 0x10000) {
11400 PyUnicode_WRITE(okind, odata, o++, '\\');
11401 PyUnicode_WRITE(okind, odata, o++, 'U');
11402 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11403 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11404 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11405 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11406 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11407 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11408 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11409 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011410 }
11411 /* Map 16-bit characters to '\uxxxx' */
11412 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413 PyUnicode_WRITE(okind, odata, o++, '\\');
11414 PyUnicode_WRITE(okind, odata, o++, 'u');
11415 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11416 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11417 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11418 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011419 }
11420 }
11421 /* Copy characters as-is */
11422 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011423 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011424 }
11425 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011426 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011427 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011428 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429}
11430
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011431PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011432 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433\n\
11434Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011435such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011436arguments start and end are interpreted as in slice notation.\n\
11437\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011438Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439
11440static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011441unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442{
Jesus Ceaac451502011-04-20 17:09:23 +020011443 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011444 Py_ssize_t start;
11445 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011446 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447
Jesus Ceaac451502011-04-20 17:09:23 +020011448 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11449 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011450 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 if (PyUnicode_READY(self) == -1)
11453 return NULL;
11454 if (PyUnicode_READY(substring) == -1)
11455 return NULL;
11456
11457 result = any_find_slice(
11458 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11459 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011460 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461
11462 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 if (result == -2)
11465 return NULL;
11466
Christian Heimes217cfd12007-12-02 14:31:20 +000011467 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468}
11469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011470PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011471 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011473Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474
11475static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011476unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477{
Jesus Ceaac451502011-04-20 17:09:23 +020011478 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011479 Py_ssize_t start;
11480 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011481 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482
Jesus Ceaac451502011-04-20 17:09:23 +020011483 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11484 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011485 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011487 if (PyUnicode_READY(self) == -1)
11488 return NULL;
11489 if (PyUnicode_READY(substring) == -1)
11490 return NULL;
11491
11492 result = any_find_slice(
11493 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11494 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011495 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496
11497 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011499 if (result == -2)
11500 return NULL;
11501
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502 if (result < 0) {
11503 PyErr_SetString(PyExc_ValueError, "substring not found");
11504 return NULL;
11505 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011506
Christian Heimes217cfd12007-12-02 14:31:20 +000011507 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508}
11509
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011510PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011511 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011513Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011514done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515
11516static PyObject *
11517unicode_rjust(PyUnicodeObject *self, PyObject *args)
11518{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011519 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011520 Py_UCS4 fillchar = ' ';
11521
Victor Stinnere9a29352011-10-01 02:14:59 +020011522 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011523 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011524
Victor Stinnere9a29352011-10-01 02:14:59 +020011525 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526 return NULL;
11527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529 Py_INCREF(self);
11530 return (PyObject*) self;
11531 }
11532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534}
11535
Alexander Belopolsky40018472011-02-26 01:02:56 +000011536PyObject *
11537PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538{
11539 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011540
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541 s = PyUnicode_FromObject(s);
11542 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011543 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011544 if (sep != NULL) {
11545 sep = PyUnicode_FromObject(sep);
11546 if (sep == NULL) {
11547 Py_DECREF(s);
11548 return NULL;
11549 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550 }
11551
11552 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11553
11554 Py_DECREF(s);
11555 Py_XDECREF(sep);
11556 return result;
11557}
11558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011559PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011560 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561\n\
11562Return a list of the words in S, using sep as the\n\
11563delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011564splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011565whitespace string is a separator and empty strings are\n\
11566removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567
11568static PyObject*
11569unicode_split(PyUnicodeObject *self, PyObject *args)
11570{
11571 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011572 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573
Martin v. Löwis18e16552006-02-15 17:27:45 +000011574 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575 return NULL;
11576
11577 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011578 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011580 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011582 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583}
11584
Thomas Wouters477c8d52006-05-27 19:21:47 +000011585PyObject *
11586PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11587{
11588 PyObject* str_obj;
11589 PyObject* sep_obj;
11590 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011591 int kind1, kind2, kind;
11592 void *buf1 = NULL, *buf2 = NULL;
11593 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011594
11595 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011596 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011597 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011598 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011599 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011600 Py_DECREF(str_obj);
11601 return NULL;
11602 }
11603
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011604 kind1 = PyUnicode_KIND(str_in);
11605 kind2 = PyUnicode_KIND(sep_obj);
11606 kind = kind1 > kind2 ? kind1 : kind2;
11607 buf1 = PyUnicode_DATA(str_in);
11608 if (kind1 != kind)
11609 buf1 = _PyUnicode_AsKind(str_in, kind);
11610 if (!buf1)
11611 goto onError;
11612 buf2 = PyUnicode_DATA(sep_obj);
11613 if (kind2 != kind)
11614 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11615 if (!buf2)
11616 goto onError;
11617 len1 = PyUnicode_GET_LENGTH(str_obj);
11618 len2 = PyUnicode_GET_LENGTH(sep_obj);
11619
11620 switch(PyUnicode_KIND(str_in)) {
11621 case PyUnicode_1BYTE_KIND:
11622 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11623 break;
11624 case PyUnicode_2BYTE_KIND:
11625 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11626 break;
11627 case PyUnicode_4BYTE_KIND:
11628 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11629 break;
11630 default:
11631 assert(0);
11632 out = 0;
11633 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011634
11635 Py_DECREF(sep_obj);
11636 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011637 if (kind1 != kind)
11638 PyMem_Free(buf1);
11639 if (kind2 != kind)
11640 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011641
11642 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643 onError:
11644 Py_DECREF(sep_obj);
11645 Py_DECREF(str_obj);
11646 if (kind1 != kind && buf1)
11647 PyMem_Free(buf1);
11648 if (kind2 != kind && buf2)
11649 PyMem_Free(buf2);
11650 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011651}
11652
11653
11654PyObject *
11655PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11656{
11657 PyObject* str_obj;
11658 PyObject* sep_obj;
11659 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011660 int kind1, kind2, kind;
11661 void *buf1 = NULL, *buf2 = NULL;
11662 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011663
11664 str_obj = PyUnicode_FromObject(str_in);
11665 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011666 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011667 sep_obj = PyUnicode_FromObject(sep_in);
11668 if (!sep_obj) {
11669 Py_DECREF(str_obj);
11670 return NULL;
11671 }
11672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011673 kind1 = PyUnicode_KIND(str_in);
11674 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011675 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676 buf1 = PyUnicode_DATA(str_in);
11677 if (kind1 != kind)
11678 buf1 = _PyUnicode_AsKind(str_in, kind);
11679 if (!buf1)
11680 goto onError;
11681 buf2 = PyUnicode_DATA(sep_obj);
11682 if (kind2 != kind)
11683 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11684 if (!buf2)
11685 goto onError;
11686 len1 = PyUnicode_GET_LENGTH(str_obj);
11687 len2 = PyUnicode_GET_LENGTH(sep_obj);
11688
11689 switch(PyUnicode_KIND(str_in)) {
11690 case PyUnicode_1BYTE_KIND:
11691 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11692 break;
11693 case PyUnicode_2BYTE_KIND:
11694 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11695 break;
11696 case PyUnicode_4BYTE_KIND:
11697 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11698 break;
11699 default:
11700 assert(0);
11701 out = 0;
11702 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011703
11704 Py_DECREF(sep_obj);
11705 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706 if (kind1 != kind)
11707 PyMem_Free(buf1);
11708 if (kind2 != kind)
11709 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011710
11711 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712 onError:
11713 Py_DECREF(sep_obj);
11714 Py_DECREF(str_obj);
11715 if (kind1 != kind && buf1)
11716 PyMem_Free(buf1);
11717 if (kind2 != kind && buf2)
11718 PyMem_Free(buf2);
11719 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011720}
11721
11722PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011723 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011724\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011725Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011726the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011727found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011728
11729static PyObject*
11730unicode_partition(PyUnicodeObject *self, PyObject *separator)
11731{
11732 return PyUnicode_Partition((PyObject *)self, separator);
11733}
11734
11735PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011736 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011737\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011738Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011739the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011740separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011741
11742static PyObject*
11743unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11744{
11745 return PyUnicode_RPartition((PyObject *)self, separator);
11746}
11747
Alexander Belopolsky40018472011-02-26 01:02:56 +000011748PyObject *
11749PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011750{
11751 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011752
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011753 s = PyUnicode_FromObject(s);
11754 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011755 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011756 if (sep != NULL) {
11757 sep = PyUnicode_FromObject(sep);
11758 if (sep == NULL) {
11759 Py_DECREF(s);
11760 return NULL;
11761 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011762 }
11763
11764 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11765
11766 Py_DECREF(s);
11767 Py_XDECREF(sep);
11768 return result;
11769}
11770
11771PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011772 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011773\n\
11774Return a list of the words in S, using sep as the\n\
11775delimiter string, starting at the end of the string and\n\
11776working to the front. If maxsplit is given, at most maxsplit\n\
11777splits are done. If sep is not specified, any whitespace string\n\
11778is a separator.");
11779
11780static PyObject*
11781unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11782{
11783 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011784 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011785
Martin v. Löwis18e16552006-02-15 17:27:45 +000011786 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011787 return NULL;
11788
11789 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011790 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011791 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011792 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011793 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011794 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011795}
11796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011797PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011798 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799\n\
11800Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011801Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011802is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803
11804static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011805unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011807 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011808 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011810 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11811 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812 return NULL;
11813
Guido van Rossum86662912000-04-11 15:38:46 +000011814 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815}
11816
11817static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011818PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819{
Walter Dörwald346737f2007-05-31 10:44:43 +000011820 if (PyUnicode_CheckExact(self)) {
11821 Py_INCREF(self);
11822 return self;
11823 } else
11824 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011825 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826}
11827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011828PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011829 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830\n\
11831Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011832and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011833
11834static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011835unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837 return fixup(self, fixswapcase);
11838}
11839
Georg Brandlceee0772007-11-27 23:48:05 +000011840PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011841 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011842\n\
11843Return a translation table usable for str.translate().\n\
11844If there is only one argument, it must be a dictionary mapping Unicode\n\
11845ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011846Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011847If there are two arguments, they must be strings of equal length, and\n\
11848in the resulting dictionary, each character in x will be mapped to the\n\
11849character at the same position in y. If there is a third argument, it\n\
11850must be a string, whose characters will be mapped to None in the result.");
11851
11852static PyObject*
11853unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11854{
11855 PyObject *x, *y = NULL, *z = NULL;
11856 PyObject *new = NULL, *key, *value;
11857 Py_ssize_t i = 0;
11858 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011859
Georg Brandlceee0772007-11-27 23:48:05 +000011860 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11861 return NULL;
11862 new = PyDict_New();
11863 if (!new)
11864 return NULL;
11865 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011866 int x_kind, y_kind, z_kind;
11867 void *x_data, *y_data, *z_data;
11868
Georg Brandlceee0772007-11-27 23:48:05 +000011869 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011870 if (!PyUnicode_Check(x)) {
11871 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11872 "be a string if there is a second argument");
11873 goto err;
11874 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011875 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011876 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11877 "arguments must have equal length");
11878 goto err;
11879 }
11880 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 x_kind = PyUnicode_KIND(x);
11882 y_kind = PyUnicode_KIND(y);
11883 x_data = PyUnicode_DATA(x);
11884 y_data = PyUnicode_DATA(y);
11885 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11886 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11887 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011888 if (!key || !value)
11889 goto err;
11890 res = PyDict_SetItem(new, key, value);
11891 Py_DECREF(key);
11892 Py_DECREF(value);
11893 if (res < 0)
11894 goto err;
11895 }
11896 /* create entries for deleting chars in z */
11897 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011898 z_kind = PyUnicode_KIND(z);
11899 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011900 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011902 if (!key)
11903 goto err;
11904 res = PyDict_SetItem(new, key, Py_None);
11905 Py_DECREF(key);
11906 if (res < 0)
11907 goto err;
11908 }
11909 }
11910 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 int kind;
11912 void *data;
11913
Georg Brandlceee0772007-11-27 23:48:05 +000011914 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011915 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011916 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11917 "to maketrans it must be a dict");
11918 goto err;
11919 }
11920 /* copy entries into the new dict, converting string keys to int keys */
11921 while (PyDict_Next(x, &i, &key, &value)) {
11922 if (PyUnicode_Check(key)) {
11923 /* convert string keys to integer keys */
11924 PyObject *newkey;
11925 if (PyUnicode_GET_SIZE(key) != 1) {
11926 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11927 "table must be of length 1");
11928 goto err;
11929 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 kind = PyUnicode_KIND(key);
11931 data = PyUnicode_DATA(key);
11932 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011933 if (!newkey)
11934 goto err;
11935 res = PyDict_SetItem(new, newkey, value);
11936 Py_DECREF(newkey);
11937 if (res < 0)
11938 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011939 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011940 /* just keep integer keys */
11941 if (PyDict_SetItem(new, key, value) < 0)
11942 goto err;
11943 } else {
11944 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11945 "be strings or integers");
11946 goto err;
11947 }
11948 }
11949 }
11950 return new;
11951 err:
11952 Py_DECREF(new);
11953 return NULL;
11954}
11955
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011956PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011957 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958\n\
11959Return a copy of the string S, where all characters have been mapped\n\
11960through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011961Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011962Unmapped characters are left untouched. Characters mapped to None\n\
11963are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964
11965static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011967{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969}
11970
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011971PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011972 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011974Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975
11976static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011977unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979 return fixup(self, fixupper);
11980}
11981
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011982PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011983 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011985Pad a numeric string S with zeros on the left, to fill a field\n\
11986of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987
11988static PyObject *
11989unicode_zfill(PyUnicodeObject *self, PyObject *args)
11990{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011991 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011993 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994 int kind;
11995 void *data;
11996 Py_UCS4 chr;
11997
11998 if (PyUnicode_READY(self) == -1)
11999 return NULL;
12000
Martin v. Löwis18e16552006-02-15 17:27:45 +000012001 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002 return NULL;
12003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012005 if (PyUnicode_CheckExact(self)) {
12006 Py_INCREF(self);
12007 return (PyObject*) self;
12008 }
12009 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012010 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012011 }
12012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012014
12015 u = pad(self, fill, 0, '0');
12016
Walter Dörwald068325e2002-04-15 13:36:47 +000012017 if (u == NULL)
12018 return NULL;
12019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 kind = PyUnicode_KIND(u);
12021 data = PyUnicode_DATA(u);
12022 chr = PyUnicode_READ(kind, data, fill);
12023
12024 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012025 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026 PyUnicode_WRITE(kind, data, 0, chr);
12027 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028 }
12029
12030 return (PyObject*) u;
12031}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032
#if 0
/* Compiled out.  Wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12040
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012041PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012042 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012044Return True if S starts with the specified prefix, False otherwise.\n\
12045With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012046With optional end, stop comparing S at that position.\n\
12047prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048
12049static PyObject *
12050unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012051 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012053 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012055 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012056 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012057 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058
Jesus Ceaac451502011-04-20 17:09:23 +020012059 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012060 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012061 if (PyTuple_Check(subobj)) {
12062 Py_ssize_t i;
12063 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12064 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012065 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012066 if (substring == NULL)
12067 return NULL;
12068 result = tailmatch(self, substring, start, end, -1);
12069 Py_DECREF(substring);
12070 if (result) {
12071 Py_RETURN_TRUE;
12072 }
12073 }
12074 /* nothing matched */
12075 Py_RETURN_FALSE;
12076 }
12077 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012078 if (substring == NULL) {
12079 if (PyErr_ExceptionMatches(PyExc_TypeError))
12080 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12081 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012082 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012083 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012084 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012086 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087}
12088
12089
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012090PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012091 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012093Return True if S ends with the specified suffix, False otherwise.\n\
12094With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012095With optional end, stop comparing S at that position.\n\
12096suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097
12098static PyObject *
12099unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012100 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012101{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012102 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012104 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012105 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012106 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107
Jesus Ceaac451502011-04-20 17:09:23 +020012108 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012109 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012110 if (PyTuple_Check(subobj)) {
12111 Py_ssize_t i;
12112 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12113 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012114 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012115 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012116 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012117 result = tailmatch(self, substring, start, end, +1);
12118 Py_DECREF(substring);
12119 if (result) {
12120 Py_RETURN_TRUE;
12121 }
12122 }
12123 Py_RETURN_FALSE;
12124 }
12125 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012126 if (substring == NULL) {
12127 if (PyErr_ExceptionMatches(PyExc_TypeError))
12128 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12129 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012130 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012131 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012132 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012133 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012134 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135}
12136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012137#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012138
12139PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012140 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012141\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012142Return a formatted version of S, using substitutions from args and kwargs.\n\
12143The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012144
Eric Smith27bbca62010-11-04 17:06:58 +000012145PyDoc_STRVAR(format_map__doc__,
12146 "S.format_map(mapping) -> str\n\
12147\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012148Return a formatted version of S, using substitutions from mapping.\n\
12149The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012150
Eric Smith4a7d76d2008-05-30 18:10:19 +000012151static PyObject *
12152unicode__format__(PyObject* self, PyObject* args)
12153{
12154 PyObject *format_spec;
12155
12156 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12157 return NULL;
12158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
12160 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000012161}
12162
Eric Smith8c663262007-08-25 02:26:07 +000012163PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012164 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012165\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012166Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012167
12168static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012169unicode__sizeof__(PyUnicodeObject *v)
12170{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012171 Py_ssize_t size;
12172
12173 /* If it's a compact object, account for base structure +
12174 character data. */
12175 if (PyUnicode_IS_COMPACT_ASCII(v))
12176 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12177 else if (PyUnicode_IS_COMPACT(v))
12178 size = sizeof(PyCompactUnicodeObject) +
12179 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12180 else {
12181 /* If it is a two-block object, account for base object, and
12182 for character block if present. */
12183 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012184 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012185 size += (PyUnicode_GET_LENGTH(v) + 1) *
12186 PyUnicode_CHARACTER_SIZE(v);
12187 }
12188 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012189 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012190 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012192 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012193 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194
12195 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012196}
12197
12198PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012199 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012200
12201static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012202unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012203{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012204 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205 if (!copy)
12206 return NULL;
12207 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012208}
12209
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210static PyMethodDef unicode_methods[] = {
12211
12212 /* Order is according to common usage: often used methods should
12213 appear first, since lookup is done sequentially. */
12214
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012215 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012216 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12217 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012218 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012219 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12220 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12221 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12222 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12223 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12224 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12225 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012226 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012227 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12228 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12229 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012230 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012231 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12232 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12233 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012234 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012235 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012236 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012237 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012238 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12239 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12240 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12241 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12242 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12243 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12244 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12245 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12246 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12247 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12248 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12249 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12250 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12251 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012252 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012253 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012254 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012255 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012256 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012257 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012258 {"maketrans", (PyCFunction) unicode_maketrans,
12259 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012260 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012261#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012262 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263#endif
12264
12265#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012266 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012267 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268#endif
12269
Benjamin Peterson14339b62009-01-31 16:36:08 +000012270 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271 {NULL, NULL}
12272};
12273
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012274static PyObject *
12275unicode_mod(PyObject *v, PyObject *w)
12276{
Brian Curtindfc80e32011-08-10 20:28:54 -050012277 if (!PyUnicode_Check(v))
12278 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012279 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012280}
12281
12282static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012283 0, /*nb_add*/
12284 0, /*nb_subtract*/
12285 0, /*nb_multiply*/
12286 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012287};
12288
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012290 (lenfunc) unicode_length, /* sq_length */
12291 PyUnicode_Concat, /* sq_concat */
12292 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12293 (ssizeargfunc) unicode_getitem, /* sq_item */
12294 0, /* sq_slice */
12295 0, /* sq_ass_item */
12296 0, /* sq_ass_slice */
12297 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012298};
12299
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012300static PyObject*
12301unicode_subscript(PyUnicodeObject* self, PyObject* item)
12302{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 if (PyUnicode_READY(self) == -1)
12304 return NULL;
12305
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012306 if (PyIndex_Check(item)) {
12307 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012308 if (i == -1 && PyErr_Occurred())
12309 return NULL;
12310 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012312 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012313 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012314 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012315 PyObject *result;
12316 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012317 int src_kind, dest_kind;
12318 Py_UCS4 ch, max_char;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012321 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012322 return NULL;
12323 }
12324
12325 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326 return PyUnicode_New(0, 0);
12327 } else if (start == 0 && step == 1 &&
12328 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012329 PyUnicode_CheckExact(self)) {
12330 Py_INCREF(self);
12331 return (PyObject *)self;
12332 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012333 return PyUnicode_Substring((PyObject*)self,
12334 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012335 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012336 /* General case */
12337 max_char = 127;
12338 src_kind = PyUnicode_KIND(self);
12339 src_data = PyUnicode_DATA(self);
12340 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12341 ch = PyUnicode_READ(src_kind, src_data, cur);
12342 if (ch > max_char)
12343 max_char = ch;
12344 }
12345 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012346 if (result == NULL)
12347 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012348 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012349 dest_data = PyUnicode_DATA(result);
12350
12351 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012352 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12353 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012354 }
12355 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012356 } else {
12357 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12358 return NULL;
12359 }
12360}
12361
12362static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012363 (lenfunc)unicode_length, /* mp_length */
12364 (binaryfunc)unicode_subscript, /* mp_subscript */
12365 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012366};
12367
Guido van Rossumd57fd912000-03-10 22:53:23 +000012368
Guido van Rossumd57fd912000-03-10 22:53:23 +000012369/* Helpers for PyUnicode_Format() */
12370
12371static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012372getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012373{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012374 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012375 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012376 (*p_argidx)++;
12377 if (arglen < 0)
12378 return args;
12379 else
12380 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381 }
12382 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012383 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012384 return NULL;
12385}
12386
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012387/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012388
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012389static PyObject *
12390formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012391{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012392 char *p;
12393 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012394 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012395
Guido van Rossumd57fd912000-03-10 22:53:23 +000012396 x = PyFloat_AsDouble(v);
12397 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012398 return NULL;
12399
Guido van Rossumd57fd912000-03-10 22:53:23 +000012400 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012401 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012402
Eric Smith0923d1d2009-04-16 20:16:10 +000012403 p = PyOS_double_to_string(x, type, prec,
12404 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012405 if (p == NULL)
12406 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012407 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012408 PyMem_Free(p);
12409 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012410}
12411
Tim Peters38fd5b62000-09-21 05:43:11 +000012412static PyObject*
12413formatlong(PyObject *val, int flags, int prec, int type)
12414{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012415 char *buf;
12416 int len;
12417 PyObject *str; /* temporary string object. */
12418 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012419
Benjamin Peterson14339b62009-01-31 16:36:08 +000012420 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12421 if (!str)
12422 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012424 Py_DECREF(str);
12425 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012426}
12427
Guido van Rossumd57fd912000-03-10 22:53:23 +000012428static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012429formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012430 size_t buflen,
12431 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012432{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012433 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012434 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012435 if (PyUnicode_GET_LENGTH(v) == 1) {
12436 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012437 buf[1] = '\0';
12438 return 1;
12439 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012440 goto onError;
12441 }
12442 else {
12443 /* Integer input truncated to a character */
12444 long x;
12445 x = PyLong_AsLong(v);
12446 if (x == -1 && PyErr_Occurred())
12447 goto onError;
12448
12449 if (x < 0 || x > 0x10ffff) {
12450 PyErr_SetString(PyExc_OverflowError,
12451 "%c arg not in range(0x110000)");
12452 return -1;
12453 }
12454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012455 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012456 buf[1] = '\0';
12457 return 1;
12458 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012459
Benjamin Peterson29060642009-01-31 22:14:21 +000012460 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012461 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012462 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012463 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012464}
12465
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012466/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012467 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012468*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012469#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012470
Alexander Belopolsky40018472011-02-26 01:02:56 +000012471PyObject *
12472PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012473{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012474 void *fmt;
12475 int fmtkind;
12476 PyObject *result;
12477 Py_UCS4 *res, *res0;
12478 Py_UCS4 max;
12479 int kind;
12480 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012481 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012482 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012483 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012484
Guido van Rossumd57fd912000-03-10 22:53:23 +000012485 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012486 PyErr_BadInternalCall();
12487 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012488 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012489 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12490 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012491 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492 fmt = PyUnicode_DATA(uformat);
12493 fmtkind = PyUnicode_KIND(uformat);
12494 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12495 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012496
12497 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12499 if (res0 == NULL) {
12500 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012501 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012502 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503
12504 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012505 arglen = PyTuple_Size(args);
12506 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012507 }
12508 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012509 arglen = -1;
12510 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012512 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012513 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012514 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515
12516 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012517 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012518 if (--rescnt < 0) {
12519 rescnt = fmtcnt + 100;
12520 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012521 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12522 if (res0 == NULL){
12523 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012524 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012525 }
12526 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012527 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012528 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012530 }
12531 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012532 /* Got a format specifier */
12533 int flags = 0;
12534 Py_ssize_t width = -1;
12535 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012536 Py_UCS4 c = '\0';
12537 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012538 int isnumok;
12539 PyObject *v = NULL;
12540 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012541 void *pbuf;
12542 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012543 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 Py_ssize_t len, len1;
12545 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012547 fmtpos++;
12548 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12549 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012550 Py_ssize_t keylen;
12551 PyObject *key;
12552 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012553
Benjamin Peterson29060642009-01-31 22:14:21 +000012554 if (dict == NULL) {
12555 PyErr_SetString(PyExc_TypeError,
12556 "format requires a mapping");
12557 goto onError;
12558 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012559 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012560 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012561 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012562 /* Skip over balanced parentheses */
12563 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012564 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012565 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012566 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012567 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012568 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012569 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012570 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012571 if (fmtcnt < 0 || pcount > 0) {
12572 PyErr_SetString(PyExc_ValueError,
12573 "incomplete format key");
12574 goto onError;
12575 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012576 key = PyUnicode_Substring((PyObject*)uformat,
12577 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012578 if (key == NULL)
12579 goto onError;
12580 if (args_owned) {
12581 Py_DECREF(args);
12582 args_owned = 0;
12583 }
12584 args = PyObject_GetItem(dict, key);
12585 Py_DECREF(key);
12586 if (args == NULL) {
12587 goto onError;
12588 }
12589 args_owned = 1;
12590 arglen = -1;
12591 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012592 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012593 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012594 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012595 case '-': flags |= F_LJUST; continue;
12596 case '+': flags |= F_SIGN; continue;
12597 case ' ': flags |= F_BLANK; continue;
12598 case '#': flags |= F_ALT; continue;
12599 case '0': flags |= F_ZERO; continue;
12600 }
12601 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012602 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012603 if (c == '*') {
12604 v = getnextarg(args, arglen, &argidx);
12605 if (v == NULL)
12606 goto onError;
12607 if (!PyLong_Check(v)) {
12608 PyErr_SetString(PyExc_TypeError,
12609 "* wants int");
12610 goto onError;
12611 }
12612 width = PyLong_AsLong(v);
12613 if (width == -1 && PyErr_Occurred())
12614 goto onError;
12615 if (width < 0) {
12616 flags |= F_LJUST;
12617 width = -width;
12618 }
12619 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012620 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012621 }
12622 else if (c >= '0' && c <= '9') {
12623 width = c - '0';
12624 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012626 if (c < '0' || c > '9')
12627 break;
12628 if ((width*10) / 10 != width) {
12629 PyErr_SetString(PyExc_ValueError,
12630 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012631 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012632 }
12633 width = width*10 + (c - '0');
12634 }
12635 }
12636 if (c == '.') {
12637 prec = 0;
12638 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012639 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012640 if (c == '*') {
12641 v = getnextarg(args, arglen, &argidx);
12642 if (v == NULL)
12643 goto onError;
12644 if (!PyLong_Check(v)) {
12645 PyErr_SetString(PyExc_TypeError,
12646 "* wants int");
12647 goto onError;
12648 }
12649 prec = PyLong_AsLong(v);
12650 if (prec == -1 && PyErr_Occurred())
12651 goto onError;
12652 if (prec < 0)
12653 prec = 0;
12654 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012656 }
12657 else if (c >= '0' && c <= '9') {
12658 prec = c - '0';
12659 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012660 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012661 if (c < '0' || c > '9')
12662 break;
12663 if ((prec*10) / 10 != prec) {
12664 PyErr_SetString(PyExc_ValueError,
12665 "prec too big");
12666 goto onError;
12667 }
12668 prec = prec*10 + (c - '0');
12669 }
12670 }
12671 } /* prec */
12672 if (fmtcnt >= 0) {
12673 if (c == 'h' || c == 'l' || c == 'L') {
12674 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012675 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012676 }
12677 }
12678 if (fmtcnt < 0) {
12679 PyErr_SetString(PyExc_ValueError,
12680 "incomplete format");
12681 goto onError;
12682 }
12683 if (c != '%') {
12684 v = getnextarg(args, arglen, &argidx);
12685 if (v == NULL)
12686 goto onError;
12687 }
12688 sign = 0;
12689 fill = ' ';
12690 switch (c) {
12691
12692 case '%':
12693 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012694 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012695 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012696 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012697 len = 1;
12698 break;
12699
12700 case 's':
12701 case 'r':
12702 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012703 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012704 temp = v;
12705 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012706 }
12707 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012708 if (c == 's')
12709 temp = PyObject_Str(v);
12710 else if (c == 'r')
12711 temp = PyObject_Repr(v);
12712 else
12713 temp = PyObject_ASCII(v);
12714 if (temp == NULL)
12715 goto onError;
12716 if (PyUnicode_Check(temp))
12717 /* nothing to do */;
12718 else {
12719 Py_DECREF(temp);
12720 PyErr_SetString(PyExc_TypeError,
12721 "%s argument has non-string str()");
12722 goto onError;
12723 }
12724 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 if (PyUnicode_READY(temp) == -1) {
12726 Py_CLEAR(temp);
12727 goto onError;
12728 }
12729 pbuf = PyUnicode_DATA(temp);
12730 kind = PyUnicode_KIND(temp);
12731 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012732 if (prec >= 0 && len > prec)
12733 len = prec;
12734 break;
12735
12736 case 'i':
12737 case 'd':
12738 case 'u':
12739 case 'o':
12740 case 'x':
12741 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012742 isnumok = 0;
12743 if (PyNumber_Check(v)) {
12744 PyObject *iobj=NULL;
12745
12746 if (PyLong_Check(v)) {
12747 iobj = v;
12748 Py_INCREF(iobj);
12749 }
12750 else {
12751 iobj = PyNumber_Long(v);
12752 }
12753 if (iobj!=NULL) {
12754 if (PyLong_Check(iobj)) {
12755 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012756 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012757 Py_DECREF(iobj);
12758 if (!temp)
12759 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012760 if (PyUnicode_READY(temp) == -1) {
12761 Py_CLEAR(temp);
12762 goto onError;
12763 }
12764 pbuf = PyUnicode_DATA(temp);
12765 kind = PyUnicode_KIND(temp);
12766 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012767 sign = 1;
12768 }
12769 else {
12770 Py_DECREF(iobj);
12771 }
12772 }
12773 }
12774 if (!isnumok) {
12775 PyErr_Format(PyExc_TypeError,
12776 "%%%c format: a number is required, "
12777 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12778 goto onError;
12779 }
12780 if (flags & F_ZERO)
12781 fill = '0';
12782 break;
12783
12784 case 'e':
12785 case 'E':
12786 case 'f':
12787 case 'F':
12788 case 'g':
12789 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012790 temp = formatfloat(v, flags, prec, c);
12791 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012792 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012793 if (PyUnicode_READY(temp) == -1) {
12794 Py_CLEAR(temp);
12795 goto onError;
12796 }
12797 pbuf = PyUnicode_DATA(temp);
12798 kind = PyUnicode_KIND(temp);
12799 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012800 sign = 1;
12801 if (flags & F_ZERO)
12802 fill = '0';
12803 break;
12804
12805 case 'c':
12806 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012807 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012808 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012809 if (len < 0)
12810 goto onError;
12811 break;
12812
12813 default:
12814 PyErr_Format(PyExc_ValueError,
12815 "unsupported format character '%c' (0x%x) "
12816 "at index %zd",
12817 (31<=c && c<=126) ? (char)c : '?',
12818 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012819 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012820 goto onError;
12821 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012822 /* pbuf is initialized here. */
12823 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012824 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012825 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12826 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12827 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012828 len--;
12829 }
12830 else if (flags & F_SIGN)
12831 sign = '+';
12832 else if (flags & F_BLANK)
12833 sign = ' ';
12834 else
12835 sign = 0;
12836 }
12837 if (width < len)
12838 width = len;
12839 if (rescnt - (sign != 0) < width) {
12840 reslen -= rescnt;
12841 rescnt = width + fmtcnt + 100;
12842 reslen += rescnt;
12843 if (reslen < 0) {
12844 Py_XDECREF(temp);
12845 PyErr_NoMemory();
12846 goto onError;
12847 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012848 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12849 if (res0 == 0) {
12850 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012851 Py_XDECREF(temp);
12852 goto onError;
12853 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012854 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012855 }
12856 if (sign) {
12857 if (fill != ' ')
12858 *res++ = sign;
12859 rescnt--;
12860 if (width > len)
12861 width--;
12862 }
12863 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012864 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12865 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012866 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012867 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12868 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012869 }
12870 rescnt -= 2;
12871 width -= 2;
12872 if (width < 0)
12873 width = 0;
12874 len -= 2;
12875 }
12876 if (width > len && !(flags & F_LJUST)) {
12877 do {
12878 --rescnt;
12879 *res++ = fill;
12880 } while (--width > len);
12881 }
12882 if (fill == ' ') {
12883 if (sign)
12884 *res++ = sign;
12885 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012886 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12887 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12888 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12889 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012890 }
12891 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012892 /* Copy all characters, preserving len */
12893 len1 = len;
12894 while (len1--) {
12895 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12896 rescnt--;
12897 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012898 while (--width >= len) {
12899 --rescnt;
12900 *res++ = ' ';
12901 }
12902 if (dict && (argidx < arglen) && c != '%') {
12903 PyErr_SetString(PyExc_TypeError,
12904 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012905 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012906 goto onError;
12907 }
12908 Py_XDECREF(temp);
12909 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012910 } /* until end */
12911 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012912 PyErr_SetString(PyExc_TypeError,
12913 "not all arguments converted during string formatting");
12914 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012915 }
12916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012917
12918 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12919 if (*res > max)
12920 max = *res;
12921 result = PyUnicode_New(reslen - rescnt, max);
12922 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012923 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012924 kind = PyUnicode_KIND(result);
12925 for (res = res0; res < res0+reslen-rescnt; res++)
12926 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12927 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012928 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012929 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012930 }
12931 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012932 return (PyObject *)result;
12933
Benjamin Peterson29060642009-01-31 22:14:21 +000012934 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012935 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012936 Py_DECREF(uformat);
12937 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012938 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012939 }
12940 return NULL;
12941}
12942
Jeremy Hylton938ace62002-07-17 16:30:39 +000012943static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012944unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12945
Tim Peters6d6c1a32001-08-02 04:15:00 +000012946static PyObject *
12947unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12948{
Benjamin Peterson29060642009-01-31 22:14:21 +000012949 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012950 static char *kwlist[] = {"object", "encoding", "errors", 0};
12951 char *encoding = NULL;
12952 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012953
Benjamin Peterson14339b62009-01-31 16:36:08 +000012954 if (type != &PyUnicode_Type)
12955 return unicode_subtype_new(type, args, kwds);
12956 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012957 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012958 return NULL;
12959 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012960 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012961 if (encoding == NULL && errors == NULL)
12962 return PyObject_Str(x);
12963 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012964 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012965}
12966
Guido van Rossume023fe02001-08-30 03:12:59 +000012967static PyObject *
12968unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12969{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012970 PyUnicodeObject *unicode, *self;
12971 Py_ssize_t length, char_size;
12972 int share_wstr, share_utf8;
12973 unsigned int kind;
12974 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012975
Benjamin Peterson14339b62009-01-31 16:36:08 +000012976 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012977
12978 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12979 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012980 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012981 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020012982 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012983 return NULL;
12984
12985 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12986 if (self == NULL) {
12987 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012988 return NULL;
12989 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012990 kind = PyUnicode_KIND(unicode);
12991 length = PyUnicode_GET_LENGTH(unicode);
12992
12993 _PyUnicode_LENGTH(self) = length;
12994 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12995 _PyUnicode_STATE(self).interned = 0;
12996 _PyUnicode_STATE(self).kind = kind;
12997 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020012998 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012999 _PyUnicode_STATE(self).ready = 1;
13000 _PyUnicode_WSTR(self) = NULL;
13001 _PyUnicode_UTF8_LENGTH(self) = 0;
13002 _PyUnicode_UTF8(self) = NULL;
13003 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013004 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013005
13006 share_utf8 = 0;
13007 share_wstr = 0;
13008 if (kind == PyUnicode_1BYTE_KIND) {
13009 char_size = 1;
13010 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13011 share_utf8 = 1;
13012 }
13013 else if (kind == PyUnicode_2BYTE_KIND) {
13014 char_size = 2;
13015 if (sizeof(wchar_t) == 2)
13016 share_wstr = 1;
13017 }
13018 else {
13019 assert(kind == PyUnicode_4BYTE_KIND);
13020 char_size = 4;
13021 if (sizeof(wchar_t) == 4)
13022 share_wstr = 1;
13023 }
13024
13025 /* Ensure we won't overflow the length. */
13026 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13027 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013028 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013029 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013030 data = PyObject_MALLOC((length + 1) * char_size);
13031 if (data == NULL) {
13032 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013033 goto onError;
13034 }
13035
Victor Stinnerc3c74152011-10-02 20:39:55 +020013036 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013037 if (share_utf8) {
13038 _PyUnicode_UTF8_LENGTH(self) = length;
13039 _PyUnicode_UTF8(self) = data;
13040 }
13041 if (share_wstr) {
13042 _PyUnicode_WSTR_LENGTH(self) = length;
13043 _PyUnicode_WSTR(self) = (wchar_t *)data;
13044 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013045
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013046 Py_MEMCPY(data, PyUnicode_DATA(unicode),
13047 PyUnicode_KIND_SIZE(kind, length + 1));
13048 Py_DECREF(unicode);
13049 return (PyObject *)self;
13050
13051onError:
13052 Py_DECREF(unicode);
13053 Py_DECREF(self);
13054 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013055}
13056
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013057PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013058 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013059\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013060Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013061encoding defaults to the current default string encoding.\n\
13062errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013063
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013064static PyObject *unicode_iter(PyObject *seq);
13065
Guido van Rossumd57fd912000-03-10 22:53:23 +000013066PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013067 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013068 "str", /* tp_name */
13069 sizeof(PyUnicodeObject), /* tp_size */
13070 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013072 (destructor)unicode_dealloc, /* tp_dealloc */
13073 0, /* tp_print */
13074 0, /* tp_getattr */
13075 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013076 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013077 unicode_repr, /* tp_repr */
13078 &unicode_as_number, /* tp_as_number */
13079 &unicode_as_sequence, /* tp_as_sequence */
13080 &unicode_as_mapping, /* tp_as_mapping */
13081 (hashfunc) unicode_hash, /* tp_hash*/
13082 0, /* tp_call*/
13083 (reprfunc) unicode_str, /* tp_str */
13084 PyObject_GenericGetAttr, /* tp_getattro */
13085 0, /* tp_setattro */
13086 0, /* tp_as_buffer */
13087 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013088 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013089 unicode_doc, /* tp_doc */
13090 0, /* tp_traverse */
13091 0, /* tp_clear */
13092 PyUnicode_RichCompare, /* tp_richcompare */
13093 0, /* tp_weaklistoffset */
13094 unicode_iter, /* tp_iter */
13095 0, /* tp_iternext */
13096 unicode_methods, /* tp_methods */
13097 0, /* tp_members */
13098 0, /* tp_getset */
13099 &PyBaseObject_Type, /* tp_base */
13100 0, /* tp_dict */
13101 0, /* tp_descr_get */
13102 0, /* tp_descr_set */
13103 0, /* tp_dictoffset */
13104 0, /* tp_init */
13105 0, /* tp_alloc */
13106 unicode_new, /* tp_new */
13107 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013108};
13109
13110/* Initialize the Unicode implementation */
13111
Thomas Wouters78890102000-07-22 19:25:51 +000013112void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013114 int i;
13115
Thomas Wouters477c8d52006-05-27 19:21:47 +000013116 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013117 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013118 0x000A, /* LINE FEED */
13119 0x000D, /* CARRIAGE RETURN */
13120 0x001C, /* FILE SEPARATOR */
13121 0x001D, /* GROUP SEPARATOR */
13122 0x001E, /* RECORD SEPARATOR */
13123 0x0085, /* NEXT LINE */
13124 0x2028, /* LINE SEPARATOR */
13125 0x2029, /* PARAGRAPH SEPARATOR */
13126 };
13127
Fred Drakee4315f52000-05-09 19:53:39 +000013128 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013129 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013130 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013131 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013132
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013133 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013134 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013135 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013136 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013137
13138 /* initialize the linebreak bloom filter */
13139 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013140 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013141 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013142
13143 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013144}
13145
13146/* Finalize the Unicode implementation */
13147
/* Clear the unicode free list.
 *
 * This implementation releases nothing and always reports 0.
 */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
13153
Guido van Rossumd57fd912000-03-10 22:53:23 +000013154void
Thomas Wouters78890102000-07-22 19:25:51 +000013155_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013156{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013157 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013158
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013159 Py_XDECREF(unicode_empty);
13160 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013161
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013162 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013163 if (unicode_latin1[i]) {
13164 Py_DECREF(unicode_latin1[i]);
13165 unicode_latin1[i] = NULL;
13166 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013167 }
Christian Heimesa156e092008-02-16 07:38:31 +000013168 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013169}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013170
Walter Dörwald16807132007-05-25 13:52:07 +000013171void
13172PyUnicode_InternInPlace(PyObject **p)
13173{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013174 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13175 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013176#ifdef Py_DEBUG
13177 assert(s != NULL);
13178 assert(_PyUnicode_CHECK(s));
13179#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013180 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013181 return;
13182#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013183 /* If it's a subclass, we don't really know what putting
13184 it in the interned dict might do. */
13185 if (!PyUnicode_CheckExact(s))
13186 return;
13187 if (PyUnicode_CHECK_INTERNED(s))
13188 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013189 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013190 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013191 return;
13192 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013193 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013194 if (interned == NULL) {
13195 interned = PyDict_New();
13196 if (interned == NULL) {
13197 PyErr_Clear(); /* Don't leave an exception */
13198 return;
13199 }
13200 }
13201 /* It might be that the GetItem call fails even
13202 though the key is present in the dictionary,
13203 namely when this happens during a stack overflow. */
13204 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013205 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013206 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013207
Benjamin Peterson29060642009-01-31 22:14:21 +000013208 if (t) {
13209 Py_INCREF(t);
13210 Py_DECREF(*p);
13211 *p = t;
13212 return;
13213 }
Walter Dörwald16807132007-05-25 13:52:07 +000013214
Benjamin Peterson14339b62009-01-31 16:36:08 +000013215 PyThreadState_GET()->recursion_critical = 1;
13216 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13217 PyErr_Clear();
13218 PyThreadState_GET()->recursion_critical = 0;
13219 return;
13220 }
13221 PyThreadState_GET()->recursion_critical = 0;
13222 /* The two references in interned are not counted by refcnt.
13223 The deallocator will take care of this */
13224 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013225 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013226}
13227
13228void
13229PyUnicode_InternImmortal(PyObject **p)
13230{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013231 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13232
Benjamin Peterson14339b62009-01-31 16:36:08 +000013233 PyUnicode_InternInPlace(p);
13234 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013235 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013236 Py_INCREF(*p);
13237 }
Walter Dörwald16807132007-05-25 13:52:07 +000013238}
13239
13240PyObject *
13241PyUnicode_InternFromString(const char *cp)
13242{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013243 PyObject *s = PyUnicode_FromString(cp);
13244 if (s == NULL)
13245 return NULL;
13246 PyUnicode_InternInPlace(&s);
13247 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013248}
13249
Alexander Belopolsky40018472011-02-26 01:02:56 +000013250void
13251_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013252{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013253 PyObject *keys;
13254 PyUnicodeObject *s;
13255 Py_ssize_t i, n;
13256 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013257
Benjamin Peterson14339b62009-01-31 16:36:08 +000013258 if (interned == NULL || !PyDict_Check(interned))
13259 return;
13260 keys = PyDict_Keys(interned);
13261 if (keys == NULL || !PyList_Check(keys)) {
13262 PyErr_Clear();
13263 return;
13264 }
Walter Dörwald16807132007-05-25 13:52:07 +000013265
Benjamin Peterson14339b62009-01-31 16:36:08 +000013266 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13267 detector, interned unicode strings are not forcibly deallocated;
13268 rather, we give them their stolen references back, and then clear
13269 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013270
Benjamin Peterson14339b62009-01-31 16:36:08 +000013271 n = PyList_GET_SIZE(keys);
13272 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013273 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013274 for (i = 0; i < n; i++) {
13275 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013276 if (PyUnicode_READY(s) == -1) {
13277 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013278 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013279 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013280 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013281 case SSTATE_NOT_INTERNED:
13282 /* XXX Shouldn't happen */
13283 break;
13284 case SSTATE_INTERNED_IMMORTAL:
13285 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013286 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013287 break;
13288 case SSTATE_INTERNED_MORTAL:
13289 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013290 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013291 break;
13292 default:
13293 Py_FatalError("Inconsistent interned string state.");
13294 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013295 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013296 }
13297 fprintf(stderr, "total size of all interned strings: "
13298 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13299 "mortal/immortal\n", mortal_size, immortal_size);
13300 Py_DECREF(keys);
13301 PyDict_Clear(interned);
13302 Py_DECREF(interned);
13303 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013304}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013305
13306
13307/********************* Unicode Iterator **************************/
13308
13309typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013310 PyObject_HEAD
13311 Py_ssize_t it_index;
13312 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013313} unicodeiterobject;
13314
13315static void
13316unicodeiter_dealloc(unicodeiterobject *it)
13317{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013318 _PyObject_GC_UNTRACK(it);
13319 Py_XDECREF(it->it_seq);
13320 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013321}
13322
13323static int
13324unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13325{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013326 Py_VISIT(it->it_seq);
13327 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013328}
13329
13330static PyObject *
13331unicodeiter_next(unicodeiterobject *it)
13332{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013333 PyUnicodeObject *seq;
13334 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013335
Benjamin Peterson14339b62009-01-31 16:36:08 +000013336 assert(it != NULL);
13337 seq = it->it_seq;
13338 if (seq == NULL)
13339 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013340 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013342 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13343 int kind = PyUnicode_KIND(seq);
13344 void *data = PyUnicode_DATA(seq);
13345 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13346 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013347 if (item != NULL)
13348 ++it->it_index;
13349 return item;
13350 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013351
Benjamin Peterson14339b62009-01-31 16:36:08 +000013352 Py_DECREF(seq);
13353 it->it_seq = NULL;
13354 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013355}
13356
13357static PyObject *
13358unicodeiter_len(unicodeiterobject *it)
13359{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013360 Py_ssize_t len = 0;
13361 if (it->it_seq)
13362 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13363 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013364}
13365
13366PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13367
13368static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013369 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013370 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013371 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013372};
13373
13374PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013375 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13376 "str_iterator", /* tp_name */
13377 sizeof(unicodeiterobject), /* tp_basicsize */
13378 0, /* tp_itemsize */
13379 /* methods */
13380 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13381 0, /* tp_print */
13382 0, /* tp_getattr */
13383 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013384 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013385 0, /* tp_repr */
13386 0, /* tp_as_number */
13387 0, /* tp_as_sequence */
13388 0, /* tp_as_mapping */
13389 0, /* tp_hash */
13390 0, /* tp_call */
13391 0, /* tp_str */
13392 PyObject_GenericGetAttr, /* tp_getattro */
13393 0, /* tp_setattro */
13394 0, /* tp_as_buffer */
13395 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13396 0, /* tp_doc */
13397 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13398 0, /* tp_clear */
13399 0, /* tp_richcompare */
13400 0, /* tp_weaklistoffset */
13401 PyObject_SelfIter, /* tp_iter */
13402 (iternextfunc)unicodeiter_next, /* tp_iternext */
13403 unicodeiter_methods, /* tp_methods */
13404 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013405};
13406
13407static PyObject *
13408unicode_iter(PyObject *seq)
13409{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013410 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013411
Benjamin Peterson14339b62009-01-31 16:36:08 +000013412 if (!PyUnicode_Check(seq)) {
13413 PyErr_BadInternalCall();
13414 return NULL;
13415 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013416 if (PyUnicode_READY(seq) == -1)
13417 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013418 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13419 if (it == NULL)
13420 return NULL;
13421 it->it_index = 0;
13422 Py_INCREF(seq);
13423 it->it_seq = (PyUnicodeObject *)seq;
13424 _PyObject_GC_TRACK(it);
13425 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013426}
13427
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013428#define UNIOP(x) Py_UNICODE_##x
13429#define UNIOP_t Py_UNICODE
13430#include "uniops.h"
13431#undef UNIOP
13432#undef UNIOP_t
13433#define UNIOP(x) Py_UCS4_##x
13434#define UNIOP_t Py_UCS4
13435#include "uniops.h"
13436#undef UNIOP
13437#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013438
Victor Stinner71133ff2010-09-01 23:43:53 +000013439Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013440PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013441{
13442 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13443 Py_UNICODE *copy;
13444 Py_ssize_t size;
13445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013446 if (!PyUnicode_Check(unicode)) {
13447 PyErr_BadArgument();
13448 return NULL;
13449 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013450 /* Ensure we won't overflow the size. */
13451 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13452 PyErr_NoMemory();
13453 return NULL;
13454 }
13455 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13456 size *= sizeof(Py_UNICODE);
13457 copy = PyMem_Malloc(size);
13458 if (copy == NULL) {
13459 PyErr_NoMemory();
13460 return NULL;
13461 }
13462 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13463 return copy;
13464}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013465
Georg Brandl66c221e2010-10-14 07:04:07 +000013466/* A _string module, to export formatter_parser and formatter_field_name_split
13467 to the string.Formatter class implemented in Python. */
13468
13469static PyMethodDef _string_methods[] = {
13470 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13471 METH_O, PyDoc_STR("split the argument as a field name")},
13472 {"formatter_parser", (PyCFunction) formatter_parser,
13473 METH_O, PyDoc_STR("parse the argument as a format string")},
13474 {NULL, NULL}
13475};
13476
13477static struct PyModuleDef _string_module = {
13478 PyModuleDef_HEAD_INIT,
13479 "_string",
13480 PyDoc_STR("string helper module"),
13481 0,
13482 _string_methods,
13483 NULL,
13484 NULL,
13485 NULL,
13486 NULL
13487};
13488
13489PyMODINIT_FUNC
13490PyInit__string(void)
13491{
13492 return PyModule_Create(&_string_module);
13493}
13494
13495
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013496#ifdef __cplusplus
13497}
13498#endif