blob: c60e32452d305d35330905a2fbbcb7d916da49e0 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Limit for the Unicode object free list */
50
Christian Heimes2202f872008-02-06 14:31:34 +000051#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000052
53/* Limit for the Unicode object free list stay alive optimization.
54
55 The implementation will keep allocated Unicode memory intact for
56 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000057 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000058
Christian Heimes2202f872008-02-06 14:31:34 +000059 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000060 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000061 malloc()-overhead) bytes of unused garbage.
62
63 Setting the limit to 0 effectively turns the feature off.
64
Guido van Rossumfd4b9572000-04-10 13:51:10 +000065 Note: This is an experimental feature ! If you get core dumps when
66 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000067
68*/
69
Guido van Rossumfd4b9572000-04-10 13:51:10 +000070#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72/* Endianness switches; defaults to little endian */
73
74#ifdef WORDS_BIGENDIAN
75# define BYTEORDER_IS_BIG_ENDIAN
76#else
77# define BYTEORDER_IS_LITTLE_ENDIAN
78#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Object check used inside the assert()s of the macros in this file:
   debug builds (Py_DEBUG) run _PyUnicode_CheckConsistency(), other builds
   fall back to a plain PyUnicode_Check(). */
#ifdef Py_DEBUG
# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#else
# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020097
/* Unchecked lvalue access to the utf8 field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked read of the UTF-8 pointer of a ready string: for a compact ASCII
   object this is the memory located right after the PyASCIIObject header,
   otherwise it is the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Unchecked lvalue access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked read of the UTF-8 length of a ready string: the length field for
   a compact ASCII object, the utf8_length field otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked lvalue accessors for individual object fields.  Note that
   wstr, length, state and hash are reached through PyASCIIObject, while
   wstr_length is reached through PyCompactUnicodeObject. */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
/* Same as the state.kind field, but asserts _PyUnicode_CHECK(op) first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* Same as the length field, but asserts _PyUnicode_CHECK(op) first. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Unchecked lvalue access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200126
/* File-local replacement for the PyUnicode_READY macro: asserts
   _PyUnicode_CHECK(op), evaluates to 0 when the object is already ready and
   otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))
132
/* True if the utf8 pointer of OP is the same pointer as PyUnicode_DATA(op).
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of OP is the same pointer as PyUnicode_DATA(op).
   The macro only refers to its own argument; it does not depend on the
   name of any variable at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
140
Victor Stinner829c0ad2011-10-03 01:08:02 +0200141/* true if the Unicode object has an allocated UTF-8 memory block
142 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200143#define _PyUnicode_HAS_UTF8_MEMORY(op) \
144 (assert(_PyUnicode_CHECK(op)), \
145 (!PyUnicode_IS_COMPACT_ASCII(op) \
146 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200147 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
148
/* Element-wise copy between character buffers of different widths.
   from_type and to_type must be valid type names.  begin and end are
   pointers of type "from_type *" delimiting the source characters; to is
   the destination buffer and is converted to "to_type *".  Each source
   element is cast to to_type and stored; the destination must have room
   for (end - begin) elements. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *src_ = (begin);                \
        to_type *dst_ = (to_type *)(to);                \
        while (src_ < (end)) {                          \
            *dst_++ = (to_type)*src_++;                 \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200163
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200164/* The Unicode string has been modified: reset the hash */
165#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
166
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
175static PyObject *interned;
176
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000177/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200178static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000179
180/* Single character Unicode strings in the Latin-1 range are being
181 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200182static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000183
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by code point, holding 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
214
Victor Stinnerfe226c02011-10-03 03:52:20 +0200215static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
216
Alexander Belopolsky40018472011-02-26 01:02:56 +0000217static PyObject *
218unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000219 PyObject **errorHandler,const char *encoding, const char *reason,
220 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
221 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
222
Alexander Belopolsky40018472011-02-26 01:02:56 +0000223static void
224raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300225 const char *encoding,
226 const Py_UNICODE *unicode, Py_ssize_t size,
227 Py_ssize_t startpos, Py_ssize_t endpos,
228 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000229
/* Fast detection of line break characters: a 128-entry table indexed by
   code point, holding 1 for a line break and 0 otherwise.  The table is
   only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
257
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300258/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
259 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000260Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000261PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000262{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000263#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000264 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000265#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000266 /* This is actually an illegal character, so it should
267 not be passed to unichr. */
268 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000269#endif
270}
271
#ifdef Py_DEBUG
/* Debug-build validation of the state flags of a unicode object.  Every
   violated invariant trips an assert(); when all of them hold the function
   returns 1, so it can itself be used inside an assert(). */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    PyCompactUnicodeObject *compact;
    PyUnicodeObject *unicode;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* ASCII flag set: must be a ready, compact, 1-byte string */
    if (ascii->state.ascii == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.compact == 1);
        assert(ascii->state.ready == 1);
        return 1;
    }

    /* compact, non-ASCII: any of the three canonical kinds, always ready */
    if (ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.compact == 1);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
        return 1;
    }

    /* neither ASCII nor compact */
    compact = (PyCompactUnicodeObject *)op;
    unicode = (PyUnicodeObject *)op;

    if (kind != PyUnicode_WCHAR_KIND) {
        /* ready: a canonical kind and a data buffer are required */
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(!ascii->state.compact == 1);
        assert(ascii->state.ready == 1);
        assert(unicode->data.any != NULL);
        assert(ascii->state.ascii == 0);
        return 1;
    }

    /* wchar kind: not ready, only the wstr buffer may be set */
    assert(!ascii->state.compact == 1);
    assert(ascii->state.ascii == 0);
    assert(!ascii->state.ready == 1);
    assert(ascii->wstr != NULL);
    assert(unicode->data.any == NULL);
    assert(compact->utf8 == NULL);
    assert(ascii->state.interned == SSTATE_NOT_INTERNED);
    return 1;
}
#endif
322
Thomas Wouters477c8d52006-05-27 19:21:47 +0000323/* --- Bloom Filters ----------------------------------------------------- */
324
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least
   significant bits of each unicode character (selected with
   BLOOM_WIDTH - 1) as the bit index. */
328
329/* the linebreak mask is set up by Unicode_Init below */
330
Antoine Pitrouf068f942010-01-13 14:19:12 +0000331#if LONG_BIT >= 128
332#define BLOOM_WIDTH 128
333#elif LONG_BIT >= 64
334#define BLOOM_WIDTH 64
335#elif LONG_BIT >= 32
336#define BLOOM_WIDTH 32
337#else
338#error "LONG_BIT is smaller than 32"
339#endif
340
Thomas Wouters477c8d52006-05-27 19:21:47 +0000341#define BLOOM_MASK unsigned long
342
343static BLOOM_MASK bloom_linebreak;
344
/* Set in MASK (an lvalue) the bit selected by CH & (BLOOM_WIDTH - 1).
   Macro arguments are fully parenthesized so that any expression can be
   passed safely. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
/* Non-zero if the bit selected by CH & (BLOOM_WIDTH - 1) is set in MASK. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: code points below 128 are looked up in ascii_linebreak;
   for anything else bloom_linebreak is consulted first and
   Py_UNICODE_ISLINEBREAK() is only called when the mask bit is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000351
Alexander Belopolsky40018472011-02-26 01:02:56 +0000352Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200353make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000354{
355 /* calculate simple bloom-style bitmask for a given unicode string */
356
Antoine Pitrouf068f942010-01-13 14:19:12 +0000357 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000358 Py_ssize_t i;
359
360 mask = 0;
361 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200362 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000363
364 return mask;
365}
366
/* True if CHR occurs in the string object STR.  MASK is tested first with
   BLOOM(); PyUnicode_FindChar() over the whole string is only called when
   the corresponding mask bit is set. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000370
Guido van Rossumd57fd912000-03-10 22:53:23 +0000371/* --- Unicode Object ----------------------------------------------------- */
372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200373static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200374fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
375
376Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
377 Py_ssize_t size, Py_UCS4 ch,
378 int direction)
379{
380 /* like wcschr, but doesn't stop at NULL characters */
381 Py_ssize_t i;
382 if (direction == 1) {
383 for(i = 0; i < size; i++)
384 if (PyUnicode_READ(kind, s, i) == ch)
385 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
386 }
387 else {
388 for(i = size-1; i >= 0; i--)
389 if (PyUnicode_READ(kind, s, i) == ch)
390 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
391 }
392 return NULL;
393}
394
Victor Stinnerfe226c02011-10-03 03:52:20 +0200395static PyObject*
396resize_compact(PyObject *unicode, Py_ssize_t length)
397{
398 Py_ssize_t char_size;
399 Py_ssize_t struct_size;
400 Py_ssize_t new_size;
401 int share_wstr;
402
403 assert(PyUnicode_IS_READY(unicode));
404 char_size = PyUnicode_CHARACTER_SIZE(unicode);
405 if (PyUnicode_IS_COMPACT_ASCII(unicode))
406 struct_size = sizeof(PyASCIIObject);
407 else
408 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200409 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200410
411 _Py_DEC_REFTOTAL;
412 _Py_ForgetReference(unicode);
413
414 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
415 PyErr_NoMemory();
416 return NULL;
417 }
418 new_size = (struct_size + (length + 1) * char_size);
419
420 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
421 if (unicode == NULL) {
422 PyObject_Del(unicode);
423 PyErr_NoMemory();
424 return NULL;
425 }
426 _Py_NewReference(unicode);
427 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200428 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200429 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200430 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
431 _PyUnicode_WSTR_LENGTH(unicode) = length;
432 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200433 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
434 length, 0);
435 return unicode;
436}
437
Alexander Belopolsky40018472011-02-26 01:02:56 +0000438static int
Victor Stinnerfe226c02011-10-03 03:52:20 +0200439resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440{
441 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200443 assert(!PyUnicode_IS_COMPACT(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200444
Victor Stinnerfe226c02011-10-03 03:52:20 +0200445 assert(Py_REFCNT(unicode) == 1);
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200446 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000447
Victor Stinnerfe226c02011-10-03 03:52:20 +0200448 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
449 {
450 PyObject_DEL(_PyUnicode_UTF8(unicode));
451 _PyUnicode_UTF8(unicode) = NULL;
452 }
453
454 if (PyUnicode_IS_READY(unicode)) {
455 Py_ssize_t char_size;
456 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200457 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200458 void *data;
459
460 data = _PyUnicode_DATA_ANY(unicode);
461 assert(data != NULL);
462 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200463 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
464 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200465
466 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
467 PyErr_NoMemory();
468 return -1;
469 }
470 new_size = (length + 1) * char_size;
471
472 data = (PyObject *)PyObject_REALLOC(data, new_size);
473 if (data == NULL) {
474 PyErr_NoMemory();
475 return -1;
476 }
477 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200478 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200479 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200480 _PyUnicode_WSTR_LENGTH(unicode) = length;
481 }
482 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200483 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200484 _PyUnicode_UTF8_LENGTH(unicode) = length;
485 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200486 _PyUnicode_LENGTH(unicode) = length;
487 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
488 if (share_wstr)
489 return 0;
490 }
491 if (_PyUnicode_WSTR(unicode) != NULL) {
492 assert(_PyUnicode_WSTR(unicode) != NULL);
493
494 oldstr = _PyUnicode_WSTR(unicode);
495 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
496 sizeof(Py_UNICODE) * (length + 1));
497 if (!_PyUnicode_WSTR(unicode)) {
498 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
499 PyErr_NoMemory();
500 return -1;
501 }
502 _PyUnicode_WSTR(unicode)[length] = 0;
503 _PyUnicode_WSTR_LENGTH(unicode) = length;
504 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 return 0;
506}
507
Victor Stinnerfe226c02011-10-03 03:52:20 +0200508static PyObject*
509resize_copy(PyObject *unicode, Py_ssize_t length)
510{
511 Py_ssize_t copy_length;
512 if (PyUnicode_IS_COMPACT(unicode)) {
513 PyObject *copy;
514 assert(PyUnicode_IS_READY(unicode));
515
516 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
517 if (copy == NULL)
518 return NULL;
519
520 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
521 if (PyUnicode_CopyCharacters(copy, 0,
522 unicode, 0,
523 copy_length) < 0)
524 {
525 Py_DECREF(copy);
526 return NULL;
527 }
528 return copy;
529 } else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200530 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200531 assert(_PyUnicode_WSTR(unicode) != NULL);
532 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200533 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200534 if (w == NULL)
535 return NULL;
536 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
537 copy_length = Py_MIN(copy_length, length);
538 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
539 copy_length);
540 return (PyObject*)w;
541 }
542}
543
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
552
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553#ifdef Py_DEBUG
554int unicode_old_new_calls = 0;
555#endif
556
Alexander Belopolsky40018472011-02-26 01:02:56 +0000557static PyUnicodeObject *
558_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000559{
560 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200561 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562
Thomas Wouters477c8d52006-05-27 19:21:47 +0000563 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000564 if (length == 0 && unicode_empty != NULL) {
565 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200566 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567 }
568
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000569 /* Ensure we won't overflow the size. */
570 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
571 return (PyUnicodeObject *)PyErr_NoMemory();
572 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200573 if (length < 0) {
574 PyErr_SetString(PyExc_SystemError,
575 "Negative size passed to _PyUnicode_New");
576 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577 }
578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200579#ifdef Py_DEBUG
580 ++unicode_old_new_calls;
581#endif
582
583 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
584 if (unicode == NULL)
585 return NULL;
586 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
587 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
588 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000589 PyErr_NoMemory();
590 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000591 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200592
Jeremy Hyltond8082792003-09-16 19:41:39 +0000593 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000594 * the caller fails before initializing str -- unicode_resize()
595 * reads str[0], and the Keep-Alive optimization can keep memory
596 * allocated for str alive across a call to unicode_dealloc(unicode).
597 * We don't want unicode_resize to read uninitialized memory in
598 * that case.
599 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200600 _PyUnicode_WSTR(unicode)[0] = 0;
601 _PyUnicode_WSTR(unicode)[length] = 0;
602 _PyUnicode_WSTR_LENGTH(unicode) = length;
603 _PyUnicode_HASH(unicode) = -1;
604 _PyUnicode_STATE(unicode).interned = 0;
605 _PyUnicode_STATE(unicode).kind = 0;
606 _PyUnicode_STATE(unicode).compact = 0;
607 _PyUnicode_STATE(unicode).ready = 0;
608 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200609 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200610 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200611 _PyUnicode_UTF8(unicode) = NULL;
612 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000613 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000614
Benjamin Peterson29060642009-01-31 22:14:21 +0000615 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000616 /* XXX UNREF/NEWREF interface should be more symmetrical */
617 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000618 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000619 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000620 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621}
622
Victor Stinnerf42dc442011-10-02 23:33:16 +0200623static const char*
624unicode_kind_name(PyObject *unicode)
625{
Victor Stinner910337b2011-10-03 03:20:16 +0200626 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerf42dc442011-10-02 23:33:16 +0200627 if (!PyUnicode_IS_COMPACT(unicode))
628 {
629 if (!PyUnicode_IS_READY(unicode))
630 return "wstr";
631 switch(PyUnicode_KIND(unicode))
632 {
633 case PyUnicode_1BYTE_KIND:
634 if (PyUnicode_IS_COMPACT_ASCII(unicode))
635 return "legacy ascii";
636 else
637 return "legacy latin1";
638 case PyUnicode_2BYTE_KIND:
639 return "legacy UCS2";
640 case PyUnicode_4BYTE_KIND:
641 return "legacy UCS4";
642 default:
643 return "<legacy invalid kind>";
644 }
645 }
646 assert(PyUnicode_IS_READY(unicode));
647 switch(PyUnicode_KIND(unicode))
648 {
649 case PyUnicode_1BYTE_KIND:
650 if (PyUnicode_IS_COMPACT_ASCII(unicode))
651 return "ascii";
652 else
653 return "compact latin1";
654 case PyUnicode_2BYTE_KIND:
655 return "compact UCS2";
656 case PyUnicode_4BYTE_KIND:
657 return "compact UCS4";
658 default:
659 return "<invalid compact kind>";
660 }
661}
662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200663#ifdef Py_DEBUG
664int unicode_new_new_calls = 0;
665
666/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}
670
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}
/* Debugger helper: print the object address, its compact / compact-ascii
   flags and the candidate data addresses (the memory following a
   PyASCIIObject header, the memory following a PyCompactUnicodeObject
   header, and _PyUnicode_COMPACT_DATA()) to stdout, then return
   PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200683
684void
685_PyUnicode_Dump(PyObject *op)
686{
687 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200688 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
689 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
690 void *data;
691 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
692 if (ascii->state.compact)
693 data = (compact + 1);
694 else
695 data = unicode->data.any;
696 if (ascii->wstr == data)
697 printf("shared ");
698 printf("wstr=%p", ascii->wstr);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200699 if (!ascii->state.ascii) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200700 printf(" (%zu), ", compact->wstr_length);
701 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
702 printf("shared ");
703 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200704 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200705 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200706}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200707#endif
708
709PyObject *
710PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
711{
712 PyObject *obj;
713 PyCompactUnicodeObject *unicode;
714 void *data;
715 int kind_state;
716 int is_sharing = 0, is_ascii = 0;
717 Py_ssize_t char_size;
718 Py_ssize_t struct_size;
719
720 /* Optimization for empty strings */
721 if (size == 0 && unicode_empty != NULL) {
722 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200723 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724 }
725
726#ifdef Py_DEBUG
727 ++unicode_new_new_calls;
728#endif
729
730 struct_size = sizeof(PyCompactUnicodeObject);
731 if (maxchar < 128) {
732 kind_state = PyUnicode_1BYTE_KIND;
733 char_size = 1;
734 is_ascii = 1;
735 struct_size = sizeof(PyASCIIObject);
736 }
737 else if (maxchar < 256) {
738 kind_state = PyUnicode_1BYTE_KIND;
739 char_size = 1;
740 }
741 else if (maxchar < 65536) {
742 kind_state = PyUnicode_2BYTE_KIND;
743 char_size = 2;
744 if (sizeof(wchar_t) == 2)
745 is_sharing = 1;
746 }
747 else {
748 kind_state = PyUnicode_4BYTE_KIND;
749 char_size = 4;
750 if (sizeof(wchar_t) == 4)
751 is_sharing = 1;
752 }
753
754 /* Ensure we won't overflow the size. */
755 if (size < 0) {
756 PyErr_SetString(PyExc_SystemError,
757 "Negative size passed to PyUnicode_New");
758 return NULL;
759 }
760 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
761 return PyErr_NoMemory();
762
763 /* Duplicated allocation code from _PyObject_New() instead of a call to
764 * PyObject_New() so we are able to allocate space for the object and
765 * it's data buffer.
766 */
767 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
768 if (obj == NULL)
769 return PyErr_NoMemory();
770 obj = PyObject_INIT(obj, &PyUnicode_Type);
771 if (obj == NULL)
772 return NULL;
773
774 unicode = (PyCompactUnicodeObject *)obj;
775 if (is_ascii)
776 data = ((PyASCIIObject*)obj) + 1;
777 else
778 data = unicode + 1;
779 _PyUnicode_LENGTH(unicode) = size;
780 _PyUnicode_HASH(unicode) = -1;
781 _PyUnicode_STATE(unicode).interned = 0;
782 _PyUnicode_STATE(unicode).kind = kind_state;
783 _PyUnicode_STATE(unicode).compact = 1;
784 _PyUnicode_STATE(unicode).ready = 1;
785 _PyUnicode_STATE(unicode).ascii = is_ascii;
786 if (is_ascii) {
787 ((char*)data)[size] = 0;
788 _PyUnicode_WSTR(unicode) = NULL;
789 }
790 else if (kind_state == PyUnicode_1BYTE_KIND) {
791 ((char*)data)[size] = 0;
792 _PyUnicode_WSTR(unicode) = NULL;
793 _PyUnicode_WSTR_LENGTH(unicode) = 0;
794 unicode->utf8_length = 0;
795 unicode->utf8 = NULL;
796 }
797 else {
798 unicode->utf8 = NULL;
799 if (kind_state == PyUnicode_2BYTE_KIND)
800 ((Py_UCS2*)data)[size] = 0;
801 else /* kind_state == PyUnicode_4BYTE_KIND */
802 ((Py_UCS4*)data)[size] = 0;
803 if (is_sharing) {
804 _PyUnicode_WSTR_LENGTH(unicode) = size;
805 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
806 }
807 else {
808 _PyUnicode_WSTR_LENGTH(unicode) = 0;
809 _PyUnicode_WSTR(unicode) = NULL;
810 }
811 }
812 return obj;
813}
814
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.  It
   decodes surrogate pairs; the other conversions are implemented as macros
   for efficiency.

   The destination is the 4-byte data buffer of `unicode`, which must already
   be sized for exactly the number of code points produced (checked by the
   assertions in debug builds). */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src + 1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* high + low surrogate: combine into one code point */
            *dst++ = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            /* BMP character or lone surrogate: copied as is */
            *dst++ = *src++;
        }
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
853
Victor Stinnercd9950f2011-10-02 00:34:53 +0200854static int
855_PyUnicode_Dirty(PyObject *unicode)
856{
Victor Stinner910337b2011-10-03 03:20:16 +0200857 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200858 if (Py_REFCNT(unicode) != 1) {
859 PyErr_SetString(PyExc_ValueError,
860 "Cannot modify a string having more than 1 reference");
861 return -1;
862 }
863 _PyUnicode_DIRTY(unicode);
864 return 0;
865}
866
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200867Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200868PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
869 PyObject *from, Py_ssize_t from_start,
870 Py_ssize_t how_many)
871{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200872 unsigned int from_kind, to_kind;
873 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200874
Victor Stinnerb1536152011-09-30 02:26:10 +0200875 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
876 PyErr_BadInternalCall();
877 return -1;
878 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200879
880 if (PyUnicode_READY(from))
881 return -1;
882 if (PyUnicode_READY(to))
883 return -1;
884
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200885 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200886 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
887 PyErr_Format(PyExc_ValueError,
888 "Cannot write %zi characters at %zi "
889 "in a string of %zi characters",
890 how_many, to_start, PyUnicode_GET_LENGTH(to));
891 return -1;
892 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200893 if (how_many == 0)
894 return 0;
895
Victor Stinnercd9950f2011-10-02 00:34:53 +0200896 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200897 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200899 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200900 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200901 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200902 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200903
Victor Stinnerf42dc442011-10-02 23:33:16 +0200904 if (from_kind == to_kind
905 /* deny latin1 => ascii */
906 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
907 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200908 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200909 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200910 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200911 + PyUnicode_KIND_SIZE(from_kind, from_start),
912 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200913 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200914 else if (from_kind == PyUnicode_1BYTE_KIND
915 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200916 {
917 _PyUnicode_CONVERT_BYTES(
918 Py_UCS1, Py_UCS2,
919 PyUnicode_1BYTE_DATA(from) + from_start,
920 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
921 PyUnicode_2BYTE_DATA(to) + to_start
922 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200923 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200924 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200925 && to_kind == PyUnicode_4BYTE_KIND)
926 {
927 _PyUnicode_CONVERT_BYTES(
928 Py_UCS1, Py_UCS4,
929 PyUnicode_1BYTE_DATA(from) + from_start,
930 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
931 PyUnicode_4BYTE_DATA(to) + to_start
932 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200933 }
934 else if (from_kind == PyUnicode_2BYTE_KIND
935 && to_kind == PyUnicode_4BYTE_KIND)
936 {
937 _PyUnicode_CONVERT_BYTES(
938 Py_UCS2, Py_UCS4,
939 PyUnicode_2BYTE_DATA(from) + from_start,
940 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
941 PyUnicode_4BYTE_DATA(to) + to_start
942 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200943 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200944 else {
945 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +0200946
947 /* check if max_char(from substring) <= max_char(to) */
948 if (from_kind > to_kind
949 /* latin1 => ascii */
950 || (PyUnicode_IS_COMPACT_ASCII(to)
951 && to_kind == PyUnicode_1BYTE_KIND
952 && !PyUnicode_IS_COMPACT_ASCII(from)))
953 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200954 /* slow path to check for character overflow */
955 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
956 Py_UCS4 ch, maxchar;
957 Py_ssize_t i;
958
959 maxchar = 0;
960 invalid_kinds = 0;
961 for (i=0; i < how_many; i++) {
962 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
963 if (ch > maxchar) {
964 maxchar = ch;
965 if (maxchar > to_maxchar) {
966 invalid_kinds = 1;
967 break;
968 }
969 }
970 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
971 }
972 }
973 else
974 invalid_kinds = 1;
975 if (invalid_kinds) {
976 PyErr_Format(PyExc_ValueError,
Victor Stinnerf42dc442011-10-02 23:33:16 +0200977 "Cannot copy %s characters "
978 "into a string of %s characters",
979 unicode_kind_name(from),
980 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +0200981 return -1;
982 }
983 }
984 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985}
986
Victor Stinner17222162011-09-28 22:15:37 +0200987/* Find the maximum code point and count the number of surrogate pairs so a
988 correct string length can be computed before converting a string to UCS4.
989 This function counts single surrogates as a character and not as a pair.
990
991 Return 0 on success, or -1 on error. */
992static int
993find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
994 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200995{
996 const wchar_t *iter;
997
Victor Stinnerc53be962011-10-02 21:33:54 +0200998 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200999 if (num_surrogates == NULL || maxchar == NULL) {
1000 PyErr_SetString(PyExc_SystemError,
1001 "unexpected NULL arguments to "
1002 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
1003 return -1;
1004 }
1005
1006 *num_surrogates = 0;
1007 *maxchar = 0;
1008
1009 for (iter = begin; iter < end; ) {
1010 if (*iter > *maxchar)
1011 *maxchar = *iter;
1012#if SIZEOF_WCHAR_T == 2
1013 if (*iter >= 0xD800 && *iter <= 0xDBFF
1014 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1015 {
1016 Py_UCS4 surrogate_val;
1017 surrogate_val = (((iter[0] & 0x3FF)<<10)
1018 | (iter[1] & 0x3FF)) + 0x10000;
1019 ++(*num_surrogates);
1020 if (surrogate_val > *maxchar)
1021 *maxchar = surrogate_val;
1022 iter += 2;
1023 }
1024 else
1025 iter++;
1026#else
1027 iter++;
1028#endif
1029 }
1030 return 0;
1031}
1032
#ifdef Py_DEBUG
/* Debug-build counter, incremented on every call to _PyUnicode_Ready(). */
int unicode_ready_calls = 0;
#endif
1036
1037int
Victor Stinnerd8f65102011-09-29 19:43:17 +02001038_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001039{
Victor Stinnerd8f65102011-09-29 19:43:17 +02001040 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001041 wchar_t *end;
1042 Py_UCS4 maxchar = 0;
1043 Py_ssize_t num_surrogates;
1044#if SIZEOF_WCHAR_T == 2
1045 Py_ssize_t length_wo_surrogates;
1046#endif
1047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001049 strings were created using _PyObject_New() and where no canonical
1050 representation (the str field) has been set yet aka strings
1051 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001052 assert(_PyUnicode_CHECK(unicode));
1053 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001055 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001056 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001057 /* Actually, it should neither be interned nor be anything else: */
1058 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001059
1060#ifdef Py_DEBUG
1061 ++unicode_ready_calls;
1062#endif
1063
1064 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001065 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001066 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068
1069 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001070 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1071 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072 PyErr_NoMemory();
1073 return -1;
1074 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001075 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001076 _PyUnicode_WSTR(unicode), end,
1077 PyUnicode_1BYTE_DATA(unicode));
1078 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1079 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1080 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1081 if (maxchar < 128) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001082 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001083 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001084 }
1085 else {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001086 _PyUnicode_UTF8(unicode) = NULL;
1087 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 }
1089 PyObject_FREE(_PyUnicode_WSTR(unicode));
1090 _PyUnicode_WSTR(unicode) = NULL;
1091 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1092 }
1093 /* In this case we might have to convert down from 4-byte native
1094 wchar_t to 2-byte unicode. */
1095 else if (maxchar < 65536) {
1096 assert(num_surrogates == 0 &&
1097 "FindMaxCharAndNumSurrogatePairs() messed up");
1098
Victor Stinner506f5922011-09-28 22:34:18 +02001099#if SIZEOF_WCHAR_T == 2
1100 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001101 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001102 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1103 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1104 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001105 _PyUnicode_UTF8(unicode) = NULL;
1106 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001107#else
1108 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001109 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001110 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001111 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001112 PyErr_NoMemory();
1113 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001114 }
Victor Stinner506f5922011-09-28 22:34:18 +02001115 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1116 _PyUnicode_WSTR(unicode), end,
1117 PyUnicode_2BYTE_DATA(unicode));
1118 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1119 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1120 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001121 _PyUnicode_UTF8(unicode) = NULL;
1122 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001123 PyObject_FREE(_PyUnicode_WSTR(unicode));
1124 _PyUnicode_WSTR(unicode) = NULL;
1125 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1126#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127 }
1128 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1129 else {
1130#if SIZEOF_WCHAR_T == 2
1131 /* in case the native representation is 2-bytes, we need to allocate a
1132 new normalized 4-byte version. */
1133 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001134 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1135 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001136 PyErr_NoMemory();
1137 return -1;
1138 }
1139 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1140 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001141 _PyUnicode_UTF8(unicode) = NULL;
1142 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001143 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1144 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001145 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146 PyObject_FREE(_PyUnicode_WSTR(unicode));
1147 _PyUnicode_WSTR(unicode) = NULL;
1148 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1149#else
1150 assert(num_surrogates == 0);
1151
Victor Stinnerc3c74152011-10-02 20:39:55 +02001152 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001153 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001154 _PyUnicode_UTF8(unicode) = NULL;
1155 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1157#endif
1158 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1159 }
1160 _PyUnicode_STATE(unicode).ready = 1;
1161 return 0;
1162}
1163
Alexander Belopolsky40018472011-02-26 01:02:56 +00001164static void
1165unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001166{
Walter Dörwald16807132007-05-25 13:52:07 +00001167 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001168 case SSTATE_NOT_INTERNED:
1169 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001170
Benjamin Peterson29060642009-01-31 22:14:21 +00001171 case SSTATE_INTERNED_MORTAL:
1172 /* revive dead object temporarily for DelItem */
1173 Py_REFCNT(unicode) = 3;
1174 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1175 Py_FatalError(
1176 "deletion of interned string failed");
1177 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001178
Benjamin Peterson29060642009-01-31 22:14:21 +00001179 case SSTATE_INTERNED_IMMORTAL:
1180 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001181
Benjamin Peterson29060642009-01-31 22:14:21 +00001182 default:
1183 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001184 }
1185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001186 if (_PyUnicode_WSTR(unicode) &&
1187 (!PyUnicode_IS_READY(unicode) ||
1188 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
1189 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001190 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001191 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001192
1193 if (PyUnicode_IS_COMPACT(unicode)) {
1194 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001195 }
1196 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001197 if (_PyUnicode_DATA_ANY(unicode))
1198 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001199 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 }
1201}
1202
Alexander Belopolsky40018472011-02-26 01:02:56 +00001203static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001204unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001205{
Victor Stinnera3be6132011-10-03 02:16:37 +02001206 Py_ssize_t len;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001207 if (Py_REFCNT(unicode) != 1)
1208 return 0;
1209 if (PyUnicode_CHECK_INTERNED(unicode))
1210 return 0;
1211 if (unicode == unicode_empty)
1212 return 0;
Victor Stinnera3be6132011-10-03 02:16:37 +02001213 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1214 len = PyUnicode_WSTR_LENGTH(unicode);
1215 else
1216 len = PyUnicode_GET_LENGTH(unicode);
1217 if (len == 1) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001218 Py_UCS4 ch;
Victor Stinnera3be6132011-10-03 02:16:37 +02001219 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001220 ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnera3be6132011-10-03 02:16:37 +02001221 else
1222 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001223 if (ch < 256 && unicode_latin1[ch] == unicode)
1224 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001225 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001226 return 1;
1227}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001228
Victor Stinnerfe226c02011-10-03 03:52:20 +02001229static int
1230unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1231{
1232 PyObject *unicode;
1233 Py_ssize_t old_length;
1234
1235 assert(p_unicode != NULL);
1236 unicode = *p_unicode;
1237
1238 assert(unicode != NULL);
1239 assert(PyUnicode_Check(unicode));
1240 assert(0 <= length);
1241
Victor Stinner910337b2011-10-03 03:20:16 +02001242 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001243 old_length = PyUnicode_WSTR_LENGTH(unicode);
1244 else
1245 old_length = PyUnicode_GET_LENGTH(unicode);
1246 if (old_length == length)
1247 return 0;
1248
1249 /* FIXME: really create a new object? */
1250 if (!unicode_resizable(unicode)) {
1251 PyObject *copy = resize_copy(unicode, length);
1252 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001253 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001254 Py_DECREF(*p_unicode);
1255 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001256 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001257 }
1258
Victor Stinnerfe226c02011-10-03 03:52:20 +02001259 if (PyUnicode_IS_COMPACT(unicode)) {
1260 *p_unicode = resize_compact(unicode, length);
1261 if (*p_unicode == NULL)
1262 return -1;
1263 return 0;
1264 } else
1265 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001266}
1267
Alexander Belopolsky40018472011-02-26 01:02:56 +00001268int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001269PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001270{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001271 PyObject *unicode;
1272 if (p_unicode == NULL) {
1273 PyErr_BadInternalCall();
1274 return -1;
1275 }
1276 unicode = *p_unicode;
1277 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1278 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1279 {
1280 PyErr_BadInternalCall();
1281 return -1;
1282 }
1283 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001284}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001286static PyObject*
1287get_latin1_char(unsigned char ch)
1288{
Victor Stinnera464fc12011-10-02 20:39:30 +02001289 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001290 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001291 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001292 if (!unicode)
1293 return NULL;
1294 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1295 unicode_latin1[ch] = unicode;
1296 }
1297 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001298 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001299}
1300
Alexander Belopolsky40018472011-02-26 01:02:56 +00001301PyObject *
1302PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303{
1304 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 Py_UCS4 maxchar = 0;
1306 Py_ssize_t num_surrogates;
1307
1308 if (u == NULL)
1309 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001310
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001311 /* If the Unicode data is known at construction time, we can apply
1312 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314 /* Optimization for empty strings */
1315 if (size == 0 && unicode_empty != NULL) {
1316 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001317 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001318 }
Tim Petersced69f82003-09-16 20:30:58 +00001319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 /* Single character Unicode objects in the Latin-1 range are
1321 shared when using this constructor */
1322 if (size == 1 && *u < 256)
1323 return get_latin1_char((unsigned char)*u);
1324
1325 /* If not empty and not single character, copy the Unicode data
1326 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001327 if (find_maxchar_surrogates(u, u + size,
1328 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001329 return NULL;
1330
1331 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1332 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001333 if (!unicode)
1334 return NULL;
1335
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001336 switch (PyUnicode_KIND(unicode)) {
1337 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001338 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001339 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1340 break;
1341 case PyUnicode_2BYTE_KIND:
1342#if Py_UNICODE_SIZE == 2
1343 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1344#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001345 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1347#endif
1348 break;
1349 case PyUnicode_4BYTE_KIND:
1350#if SIZEOF_WCHAR_T == 2
1351 /* This is the only case which has to process surrogates, thus
1352 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001353 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001354#else
1355 assert(num_surrogates == 0);
1356 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1357#endif
1358 break;
1359 default:
1360 assert(0 && "Impossible state");
1361 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362
1363 return (PyObject *)unicode;
1364}
1365
Alexander Belopolsky40018472011-02-26 01:02:56 +00001366PyObject *
1367PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001368{
1369 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001370
Benjamin Peterson14339b62009-01-31 16:36:08 +00001371 if (size < 0) {
1372 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001373 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001374 return NULL;
1375 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001376
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001377 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001378 some optimizations which share commonly used objects.
1379 Also, this means the input must be UTF-8, so fall back to the
1380 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001381 if (u != NULL) {
1382
Benjamin Peterson29060642009-01-31 22:14:21 +00001383 /* Optimization for empty strings */
1384 if (size == 0 && unicode_empty != NULL) {
1385 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001386 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001387 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001388
1389 /* Single characters are shared when using this constructor.
1390 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391 if (size == 1 && Py_CHARMASK(*u) < 128)
1392 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001393
1394 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001395 }
1396
Walter Dörwald55507312007-05-18 13:12:10 +00001397 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001398 if (!unicode)
1399 return NULL;
1400
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001401 return (PyObject *)unicode;
1402}
1403
Alexander Belopolsky40018472011-02-26 01:02:56 +00001404PyObject *
1405PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001406{
1407 size_t size = strlen(u);
1408 if (size > PY_SSIZE_T_MAX) {
1409 PyErr_SetString(PyExc_OverflowError, "input too long");
1410 return NULL;
1411 }
1412
1413 return PyUnicode_FromStringAndSize(u, size);
1414}
1415
Victor Stinnere57b1c02011-09-28 22:20:48 +02001416static PyObject*
1417_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001418{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419 PyObject *res;
1420 unsigned char max = 127;
1421 Py_ssize_t i;
1422 for (i = 0; i < size; i++) {
1423 if (u[i] & 0x80) {
1424 max = 255;
1425 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001426 }
1427 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001428 res = PyUnicode_New(size, max);
1429 if (!res)
1430 return NULL;
1431 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1432 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001433}
1434
Victor Stinnere57b1c02011-09-28 22:20:48 +02001435static PyObject*
1436_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437{
1438 PyObject *res;
1439 Py_UCS2 max = 0;
1440 Py_ssize_t i;
1441 for (i = 0; i < size; i++)
1442 if (u[i] > max)
1443 max = u[i];
1444 res = PyUnicode_New(size, max);
1445 if (!res)
1446 return NULL;
1447 if (max >= 256)
1448 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1449 else
1450 for (i = 0; i < size; i++)
1451 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1452 return res;
1453}
1454
Victor Stinnere57b1c02011-09-28 22:20:48 +02001455static PyObject*
1456_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457{
1458 PyObject *res;
1459 Py_UCS4 max = 0;
1460 Py_ssize_t i;
1461 for (i = 0; i < size; i++)
1462 if (u[i] > max)
1463 max = u[i];
1464 res = PyUnicode_New(size, max);
1465 if (!res)
1466 return NULL;
1467 if (max >= 0x10000)
1468 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1469 else {
1470 int kind = PyUnicode_KIND(res);
1471 void *data = PyUnicode_DATA(res);
1472 for (i = 0; i < size; i++)
1473 PyUnicode_WRITE(kind, data, i, u[i]);
1474 }
1475 return res;
1476}
1477
1478PyObject*
1479PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1480{
1481 switch(kind) {
1482 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001483 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001485 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001486 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001487 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 }
Victor Stinner202b62b2011-10-01 23:48:37 +02001489 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 return NULL;
1491}
1492
Victor Stinner034f6cf2011-09-30 02:26:44 +02001493PyObject*
1494PyUnicode_Copy(PyObject *unicode)
1495{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001496 Py_ssize_t size;
1497 PyObject *copy;
1498 void *data;
1499
Victor Stinner034f6cf2011-09-30 02:26:44 +02001500 if (!PyUnicode_Check(unicode)) {
1501 PyErr_BadInternalCall();
1502 return NULL;
1503 }
1504 if (PyUnicode_READY(unicode))
1505 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001506
1507 size = PyUnicode_GET_LENGTH(unicode);
1508 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1509 if (!copy)
1510 return NULL;
1511 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1512
1513 data = PyUnicode_DATA(unicode);
1514 switch (PyUnicode_KIND(unicode))
1515 {
1516 case PyUnicode_1BYTE_KIND:
1517 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1518 break;
1519 case PyUnicode_2BYTE_KIND:
1520 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1521 break;
1522 case PyUnicode_4BYTE_KIND:
1523 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1524 break;
1525 default:
1526 assert(0);
1527 break;
1528 }
1529 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001530}
1531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001532
Victor Stinnerbc603d12011-10-02 01:00:40 +02001533/* Widen Unicode objects to larger buffers. Don't write terminating null
1534 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001535
1536void*
1537_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1538{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001539 Py_ssize_t len;
1540 void *result;
1541 unsigned int skind;
1542
1543 if (PyUnicode_READY(s))
1544 return NULL;
1545
1546 len = PyUnicode_GET_LENGTH(s);
1547 skind = PyUnicode_KIND(s);
1548 if (skind >= kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001549 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1550 return NULL;
1551 }
1552 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001553 case PyUnicode_2BYTE_KIND:
1554 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1555 if (!result)
1556 return PyErr_NoMemory();
1557 assert(skind == PyUnicode_1BYTE_KIND);
1558 _PyUnicode_CONVERT_BYTES(
1559 Py_UCS1, Py_UCS2,
1560 PyUnicode_1BYTE_DATA(s),
1561 PyUnicode_1BYTE_DATA(s) + len,
1562 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001563 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001564 case PyUnicode_4BYTE_KIND:
1565 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1566 if (!result)
1567 return PyErr_NoMemory();
1568 if (skind == PyUnicode_2BYTE_KIND) {
1569 _PyUnicode_CONVERT_BYTES(
1570 Py_UCS2, Py_UCS4,
1571 PyUnicode_2BYTE_DATA(s),
1572 PyUnicode_2BYTE_DATA(s) + len,
1573 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001574 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001575 else {
1576 assert(skind == PyUnicode_1BYTE_KIND);
1577 _PyUnicode_CONVERT_BYTES(
1578 Py_UCS1, Py_UCS4,
1579 PyUnicode_1BYTE_DATA(s),
1580 PyUnicode_1BYTE_DATA(s) + len,
1581 result);
1582 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001583 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001584 default:
1585 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001586 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001587 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001588 return NULL;
1589}
1590
1591static Py_UCS4*
1592as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1593 int copy_null)
1594{
1595 int kind;
1596 void *data;
1597 Py_ssize_t len, targetlen;
1598 if (PyUnicode_READY(string) == -1)
1599 return NULL;
1600 kind = PyUnicode_KIND(string);
1601 data = PyUnicode_DATA(string);
1602 len = PyUnicode_GET_LENGTH(string);
1603 targetlen = len;
1604 if (copy_null)
1605 targetlen++;
1606 if (!target) {
1607 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1608 PyErr_NoMemory();
1609 return NULL;
1610 }
1611 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1612 if (!target) {
1613 PyErr_NoMemory();
1614 return NULL;
1615 }
1616 }
1617 else {
1618 if (targetsize < targetlen) {
1619 PyErr_Format(PyExc_SystemError,
1620 "string is longer than the buffer");
1621 if (copy_null && 0 < targetsize)
1622 target[0] = 0;
1623 return NULL;
1624 }
1625 }
1626 if (kind != PyUnicode_4BYTE_KIND) {
1627 Py_ssize_t i;
1628 for (i = 0; i < len; i++)
1629 target[i] = PyUnicode_READ(kind, data, i);
1630 }
1631 else
1632 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1633 if (copy_null)
1634 target[len] = 0;
1635 return target;
1636}
1637
1638Py_UCS4*
1639PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1640 int copy_null)
1641{
1642 if (target == NULL || targetsize < 1) {
1643 PyErr_BadInternalCall();
1644 return NULL;
1645 }
1646 return as_ucs4(string, target, targetsize, copy_null);
1647}
1648
1649Py_UCS4*
1650PyUnicode_AsUCS4Copy(PyObject *string)
1651{
1652 return as_ucs4(string, NULL, 0, 1);
1653}
1654
1655#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001656
Alexander Belopolsky40018472011-02-26 01:02:56 +00001657PyObject *
1658PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001659{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001660 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001661 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001662 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001663 PyErr_BadInternalCall();
1664 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665 }
1666
Martin v. Löwis790465f2008-04-05 20:41:37 +00001667 if (size == -1) {
1668 size = wcslen(w);
1669 }
1670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001671 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001672}
1673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001674#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001675
Walter Dörwald346737f2007-05-31 10:44:43 +00001676static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001677makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1678 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001679{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001680 *fmt++ = '%';
1681 if (width) {
1682 if (zeropad)
1683 *fmt++ = '0';
1684 fmt += sprintf(fmt, "%d", width);
1685 }
1686 if (precision)
1687 fmt += sprintf(fmt, ".%d", precision);
1688 if (longflag)
1689 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001690 else if (longlongflag) {
1691 /* longlongflag should only ever be nonzero on machines with
1692 HAVE_LONG_LONG defined */
1693#ifdef HAVE_LONG_LONG
1694 char *f = PY_FORMAT_LONG_LONG;
1695 while (*f)
1696 *fmt++ = *f++;
1697#else
1698 /* we shouldn't ever get here */
1699 assert(0);
1700 *fmt++ = 'l';
1701#endif
1702 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001703 else if (size_tflag) {
1704 char *f = PY_FORMAT_SIZE_T;
1705 while (*f)
1706 *fmt++ = *f++;
1707 }
1708 *fmt++ = c;
1709 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001710}
1711
/* Helper for PyUnicode_FromFormatV(): parse the part of a format
   specification that sits between '%' and the conversion character.

   `f` must point at the '%'.  The optional "width" and ".precision" digit
   runs are read first, then an optional length modifier: "l", "ll" (only
   when HAVE_LONG_LONG is defined) or "z".  A modifier is recognised only
   when it is directly followed by 'd', 'u' or 'i'.

   Each p_* output pointer may be NULL; non-NULL ones receive the parsed
   value (0 when the item is absent).  Width and precision saturate at
   INT_MAX instead of overflowing on very long digit runs.

   Returns a pointer to the conversion character.  For malformed input the
   pointer is moved back by one so that the caller's loop increment does
   not step over the end of the string:
     - "%.3%s" => the result points to "3";
     - "%.1" (format ends early) => the result points to "1". */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5.
       Only the ASCII digits '0'..'9' count as digits. */
    f++;
    for (; *f >= '0' && *f <= '9'; f++) {
        int digit = *f - '0';
        /* saturate: width*10 + digit must never overflow a signed int */
        if (width > (INT_MAX - digit) / 10)
            width = INT_MAX;
        else
            width = width * 10 + digit;
    }
    if (*f == '.') {
        f++;
        for (; *f >= '0' && *f <= '9'; f++) {
            int digit = *f - '0';
            if (precision > (INT_MAX - digit) / 10)
                precision = INT_MAX;
            else
                precision = precision * 10 + digit;
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %li, %lld, %llu and %lli. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1776
/* Upper bound on the characters produced by a %ld conversion: 21 covers a
   64-bit integer written in decimal together with an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the characters produced by a %lld conversion.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1784
Walter Dörwaldd2034312007-05-18 16:29:38 +00001785PyObject *
1786PyUnicode_FromFormatV(const char *format, va_list vargs)
1787{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001788 va_list count;
1789 Py_ssize_t callcount = 0;
1790 PyObject **callresults = NULL;
1791 PyObject **callresult = NULL;
1792 Py_ssize_t n = 0;
1793 int width = 0;
1794 int precision = 0;
1795 int zeropad;
1796 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001798 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001799 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1801 Py_UCS4 argmaxchar;
1802 Py_ssize_t numbersize = 0;
1803 char *numberresults = NULL;
1804 char *numberresult = NULL;
1805 Py_ssize_t i;
1806 int kind;
1807 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001808
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001809 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001810 /* step 1: count the number of %S/%R/%A/%s format specifications
1811 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1812 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 * result in an array)
1814 * also esimate a upper bound for all the number formats in the string,
1815 * numbers will be formated in step 3 and be keept in a '\0'-separated
1816 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001817 for (f = format; *f; f++) {
1818 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001819 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001820 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1821 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1822 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1823 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001826#ifdef HAVE_LONG_LONG
1827 if (longlongflag) {
1828 if (width < MAX_LONG_LONG_CHARS)
1829 width = MAX_LONG_LONG_CHARS;
1830 }
1831 else
1832#endif
1833 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1834 including sign. Decimal takes the most space. This
1835 isn't enough for octal. If a width is specified we
1836 need more (which we allocate later). */
1837 if (width < MAX_LONG_CHARS)
1838 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839
1840 /* account for the size + '\0' to separate numbers
1841 inside of the numberresults buffer */
1842 numbersize += (width + 1);
1843 }
1844 }
1845 else if ((unsigned char)*f > 127) {
1846 PyErr_Format(PyExc_ValueError,
1847 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1848 "string, got a non-ASCII byte: 0x%02x",
1849 (unsigned char)*f);
1850 return NULL;
1851 }
1852 }
1853 /* step 2: allocate memory for the results of
1854 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1855 if (callcount) {
1856 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1857 if (!callresults) {
1858 PyErr_NoMemory();
1859 return NULL;
1860 }
1861 callresult = callresults;
1862 }
1863 /* step 2.5: allocate memory for the results of formating numbers */
1864 if (numbersize) {
1865 numberresults = PyObject_Malloc(numbersize);
1866 if (!numberresults) {
1867 PyErr_NoMemory();
1868 goto fail;
1869 }
1870 numberresult = numberresults;
1871 }
1872
1873 /* step 3: format numbers and figure out how large a buffer we need */
1874 for (f = format; *f; f++) {
1875 if (*f == '%') {
1876 const char* p;
1877 int longflag;
1878 int longlongflag;
1879 int size_tflag;
1880 int numprinted;
1881
1882 p = f;
1883 zeropad = (f[1] == '0');
1884 f = parse_format_flags(f, &width, &precision,
1885 &longflag, &longlongflag, &size_tflag);
1886 switch (*f) {
1887 case 'c':
1888 {
1889 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001890 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001891 n++;
1892 break;
1893 }
1894 case '%':
1895 n++;
1896 break;
1897 case 'i':
1898 case 'd':
1899 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1900 width, precision, *f);
1901 if (longflag)
1902 numprinted = sprintf(numberresult, fmt,
1903 va_arg(count, long));
1904#ifdef HAVE_LONG_LONG
1905 else if (longlongflag)
1906 numprinted = sprintf(numberresult, fmt,
1907 va_arg(count, PY_LONG_LONG));
1908#endif
1909 else if (size_tflag)
1910 numprinted = sprintf(numberresult, fmt,
1911 va_arg(count, Py_ssize_t));
1912 else
1913 numprinted = sprintf(numberresult, fmt,
1914 va_arg(count, int));
1915 n += numprinted;
1916 /* advance by +1 to skip over the '\0' */
1917 numberresult += (numprinted + 1);
1918 assert(*(numberresult - 1) == '\0');
1919 assert(*(numberresult - 2) != '\0');
1920 assert(numprinted >= 0);
1921 assert(numberresult <= numberresults + numbersize);
1922 break;
1923 case 'u':
1924 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1925 width, precision, 'u');
1926 if (longflag)
1927 numprinted = sprintf(numberresult, fmt,
1928 va_arg(count, unsigned long));
1929#ifdef HAVE_LONG_LONG
1930 else if (longlongflag)
1931 numprinted = sprintf(numberresult, fmt,
1932 va_arg(count, unsigned PY_LONG_LONG));
1933#endif
1934 else if (size_tflag)
1935 numprinted = sprintf(numberresult, fmt,
1936 va_arg(count, size_t));
1937 else
1938 numprinted = sprintf(numberresult, fmt,
1939 va_arg(count, unsigned int));
1940 n += numprinted;
1941 numberresult += (numprinted + 1);
1942 assert(*(numberresult - 1) == '\0');
1943 assert(*(numberresult - 2) != '\0');
1944 assert(numprinted >= 0);
1945 assert(numberresult <= numberresults + numbersize);
1946 break;
1947 case 'x':
1948 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1949 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1950 n += numprinted;
1951 numberresult += (numprinted + 1);
1952 assert(*(numberresult - 1) == '\0');
1953 assert(*(numberresult - 2) != '\0');
1954 assert(numprinted >= 0);
1955 assert(numberresult <= numberresults + numbersize);
1956 break;
1957 case 'p':
1958 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1959 /* %p is ill-defined: ensure leading 0x. */
1960 if (numberresult[1] == 'X')
1961 numberresult[1] = 'x';
1962 else if (numberresult[1] != 'x') {
1963 memmove(numberresult + 2, numberresult,
1964 strlen(numberresult) + 1);
1965 numberresult[0] = '0';
1966 numberresult[1] = 'x';
1967 numprinted += 2;
1968 }
1969 n += numprinted;
1970 numberresult += (numprinted + 1);
1971 assert(*(numberresult - 1) == '\0');
1972 assert(*(numberresult - 2) != '\0');
1973 assert(numprinted >= 0);
1974 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001975 break;
1976 case 's':
1977 {
1978 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001979 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001980 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1981 if (!str)
1982 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001983 /* since PyUnicode_DecodeUTF8 returns already flexible
1984 unicode objects, there is no need to call ready on them */
1985 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001986 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001987 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001988 /* Remember the str and switch to the next slot */
1989 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001990 break;
1991 }
1992 case 'U':
1993 {
1994 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02001995 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001996 if (PyUnicode_READY(obj) == -1)
1997 goto fail;
1998 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001999 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002001 break;
2002 }
2003 case 'V':
2004 {
2005 PyObject *obj = va_arg(count, PyObject *);
2006 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002007 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002008 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002009 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002010 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002011 if (PyUnicode_READY(obj) == -1)
2012 goto fail;
2013 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002014 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002015 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002016 *callresult++ = NULL;
2017 }
2018 else {
2019 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2020 if (!str_obj)
2021 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002023 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002025 *callresult++ = str_obj;
2026 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002027 break;
2028 }
2029 case 'S':
2030 {
2031 PyObject *obj = va_arg(count, PyObject *);
2032 PyObject *str;
2033 assert(obj);
2034 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002036 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002038 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002039 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002040 /* Remember the str and switch to the next slot */
2041 *callresult++ = str;
2042 break;
2043 }
2044 case 'R':
2045 {
2046 PyObject *obj = va_arg(count, PyObject *);
2047 PyObject *repr;
2048 assert(obj);
2049 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002051 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002052 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002053 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002054 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002055 /* Remember the repr and switch to the next slot */
2056 *callresult++ = repr;
2057 break;
2058 }
2059 case 'A':
2060 {
2061 PyObject *obj = va_arg(count, PyObject *);
2062 PyObject *ascii;
2063 assert(obj);
2064 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002065 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002066 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002067 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002068 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002069 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002070 /* Remember the repr and switch to the next slot */
2071 *callresult++ = ascii;
2072 break;
2073 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002074 default:
2075 /* if we stumble upon an unknown
2076 formatting code, copy the rest of
2077 the format string to the output
2078 string. (we cannot just skip the
2079 code, since there's no way to know
2080 what's in the argument list) */
2081 n += strlen(p);
2082 goto expand;
2083 }
2084 } else
2085 n++;
2086 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002087 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002088 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002089 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002090 we don't have to resize the string.
2091 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002092 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002093 if (!string)
2094 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002095 kind = PyUnicode_KIND(string);
2096 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002097 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002098 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002100 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002101 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002102 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002103
2104 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002105 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2106 /* checking for == because the last argument could be a empty
2107 string, which causes i to point to end, the assert at the end of
2108 the loop */
2109 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002110
Benjamin Peterson14339b62009-01-31 16:36:08 +00002111 switch (*f) {
2112 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002113 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002114 const int ordinal = va_arg(vargs, int);
2115 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002116 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002117 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002118 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002119 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002120 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002121 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002122 case 'p':
2123 /* unused, since we already have the result */
2124 if (*f == 'p')
2125 (void) va_arg(vargs, void *);
2126 else
2127 (void) va_arg(vargs, int);
2128 /* extract the result from numberresults and append. */
2129 for (; *numberresult; ++i, ++numberresult)
2130 PyUnicode_WRITE(kind, data, i, *numberresult);
2131 /* skip over the separating '\0' */
2132 assert(*numberresult == '\0');
2133 numberresult++;
2134 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002135 break;
2136 case 's':
2137 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002138 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002139 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002140 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002141 size = PyUnicode_GET_LENGTH(*callresult);
2142 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002143 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2144 *callresult, 0,
2145 size) < 0)
2146 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002147 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002148 /* We're done with the unicode()/repr() => forget it */
2149 Py_DECREF(*callresult);
2150 /* switch to next unicode()/repr() result */
2151 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002152 break;
2153 }
2154 case 'U':
2155 {
2156 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 Py_ssize_t size;
2158 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2159 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002160 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2161 obj, 0,
2162 size) < 0)
2163 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002164 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002165 break;
2166 }
2167 case 'V':
2168 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002170 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002171 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002172 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 size = PyUnicode_GET_LENGTH(obj);
2174 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002175 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2176 obj, 0,
2177 size) < 0)
2178 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002179 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002180 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002181 size = PyUnicode_GET_LENGTH(*callresult);
2182 assert(PyUnicode_KIND(*callresult) <=
2183 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002184 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2185 *callresult,
2186 0, size) < 0)
2187 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002188 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002189 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002190 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002191 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002192 break;
2193 }
2194 case 'S':
2195 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002196 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002197 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002198 /* unused, since we already have the result */
2199 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002200 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002201 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2202 *callresult, 0,
2203 PyUnicode_GET_LENGTH(*callresult)) < 0)
2204 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002206 /* We're done with the unicode()/repr() => forget it */
2207 Py_DECREF(*callresult);
2208 /* switch to next unicode()/repr() result */
2209 ++callresult;
2210 break;
2211 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002212 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002213 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002214 break;
2215 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 for (; *p; ++p, ++i)
2217 PyUnicode_WRITE(kind, data, i, *p);
2218 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002219 goto end;
2220 }
Victor Stinner1205f272010-09-11 00:54:47 +00002221 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002222 else {
2223 assert(i < PyUnicode_GET_LENGTH(string));
2224 PyUnicode_WRITE(kind, data, i++, *f);
2225 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002226 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002227 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002228
Benjamin Peterson29060642009-01-31 22:14:21 +00002229 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002230 if (callresults)
2231 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232 if (numberresults)
2233 PyObject_Free(numberresults);
2234 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002235 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002236 if (callresults) {
2237 PyObject **callresult2 = callresults;
2238 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002239 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002240 ++callresult2;
2241 }
2242 PyObject_Free(callresults);
2243 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002244 if (numberresults)
2245 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002246 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002247}
2248
Walter Dörwaldd2034312007-05-18 16:29:38 +00002249PyObject *
2250PyUnicode_FromFormat(const char *format, ...)
2251{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002252 PyObject* ret;
2253 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002254
2255#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002256 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002257#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002258 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002259#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002260 ret = PyUnicode_FromFormatV(format, vargs);
2261 va_end(vargs);
2262 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002263}
2264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265#ifdef HAVE_WCHAR_H
2266
Victor Stinner5593d8a2010-10-02 11:11:27 +00002267/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2268 convert a Unicode object to a wide character string.
2269
Victor Stinnerd88d9832011-09-06 02:00:05 +02002270 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002271 character) required to convert the unicode object. Ignore size argument.
2272
Victor Stinnerd88d9832011-09-06 02:00:05 +02002273 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002274 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002275 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002276static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002277unicode_aswidechar(PyUnicodeObject *unicode,
2278 wchar_t *w,
2279 Py_ssize_t size)
2280{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002281 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002282 const wchar_t *wstr;
2283
2284 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2285 if (wstr == NULL)
2286 return -1;
2287
Victor Stinner5593d8a2010-10-02 11:11:27 +00002288 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002289 if (size > res)
2290 size = res + 1;
2291 else
2292 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002293 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002294 return res;
2295 }
2296 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002297 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002298}
2299
2300Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002301PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002302 wchar_t *w,
2303 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002304{
2305 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002306 PyErr_BadInternalCall();
2307 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002309 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310}
2311
Victor Stinner137c34c2010-09-29 10:25:54 +00002312wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002313PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002314 Py_ssize_t *size)
2315{
2316 wchar_t* buffer;
2317 Py_ssize_t buflen;
2318
2319 if (unicode == NULL) {
2320 PyErr_BadInternalCall();
2321 return NULL;
2322 }
2323
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002324 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002325 if (buflen == -1)
2326 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002327 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002328 PyErr_NoMemory();
2329 return NULL;
2330 }
2331
Victor Stinner137c34c2010-09-29 10:25:54 +00002332 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2333 if (buffer == NULL) {
2334 PyErr_NoMemory();
2335 return NULL;
2336 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002337 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002338 if (buflen == -1)
2339 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002340 if (size != NULL)
2341 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002342 return buffer;
2343}
2344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002345#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002346
Alexander Belopolsky40018472011-02-26 01:02:56 +00002347PyObject *
2348PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002349{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002350 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002351 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002352 PyErr_SetString(PyExc_ValueError,
2353 "chr() arg not in range(0x110000)");
2354 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002355 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002357 if (ordinal < 256)
2358 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002360 v = PyUnicode_New(1, ordinal);
2361 if (v == NULL)
2362 return NULL;
2363 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2364 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002365}
2366
Alexander Belopolsky40018472011-02-26 01:02:56 +00002367PyObject *
2368PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002369{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002370 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002371 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002372 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002373 if (PyUnicode_READY(obj))
2374 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002375 Py_INCREF(obj);
2376 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002377 }
2378 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002379 /* For a Unicode subtype that's not a Unicode object,
2380 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002381 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002382 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002383 PyErr_Format(PyExc_TypeError,
2384 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002385 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002386 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002387}
2388
Alexander Belopolsky40018472011-02-26 01:02:56 +00002389PyObject *
2390PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002391 const char *encoding,
2392 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002393{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002394 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002395 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002396
Guido van Rossumd57fd912000-03-10 22:53:23 +00002397 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002398 PyErr_BadInternalCall();
2399 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002400 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002401
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002402 /* Decoding bytes objects is the most common case and should be fast */
2403 if (PyBytes_Check(obj)) {
2404 if (PyBytes_GET_SIZE(obj) == 0) {
2405 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002406 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002407 }
2408 else {
2409 v = PyUnicode_Decode(
2410 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2411 encoding, errors);
2412 }
2413 return v;
2414 }
2415
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002416 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002417 PyErr_SetString(PyExc_TypeError,
2418 "decoding str is not supported");
2419 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002420 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002421
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002422 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2423 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2424 PyErr_Format(PyExc_TypeError,
2425 "coercing to str: need bytes, bytearray "
2426 "or buffer-like object, %.80s found",
2427 Py_TYPE(obj)->tp_name);
2428 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002429 }
Tim Petersced69f82003-09-16 20:30:58 +00002430
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002431 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002432 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002433 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002434 }
Tim Petersced69f82003-09-16 20:30:58 +00002435 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002436 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002437
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002438 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002439 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002440}
2441
/* Write a normalized copy of encoding into lower: upper case letters are
   converted to lower case and '_' becomes '-', so that e.g. "UTF_8" ends up
   as "utf-8".  lower has room for lower_len chars including the final '\0'.

   Return 1 on success, 0 if encoding does not fit (it is longer than
   lower_len-1 characters). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last slot of the output buffer; reserved for the terminator. */
    char *dst_last = &lower[lower_len - 1];

    for (; *src; src++, dst++) {
        if (dst == dst_last)
            return 0;
        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
2474
Alexander Belopolsky40018472011-02-26 01:02:56 +00002475PyObject *
2476PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002477 Py_ssize_t size,
2478 const char *encoding,
2479 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002480{
2481 PyObject *buffer = NULL, *unicode;
2482 Py_buffer info;
2483 char lower[11]; /* Enough for any encoding shortcut */
2484
2485 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002486 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002487
2488 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002489 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002490 if ((strcmp(lower, "utf-8") == 0) ||
2491 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002492 return PyUnicode_DecodeUTF8(s, size, errors);
2493 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002494 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002495 (strcmp(lower, "iso-8859-1") == 0))
2496 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002497#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002498 else if (strcmp(lower, "mbcs") == 0)
2499 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002500#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002501 else if (strcmp(lower, "ascii") == 0)
2502 return PyUnicode_DecodeASCII(s, size, errors);
2503 else if (strcmp(lower, "utf-16") == 0)
2504 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2505 else if (strcmp(lower, "utf-32") == 0)
2506 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2507 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508
2509 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002510 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002511 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002512 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002513 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002514 if (buffer == NULL)
2515 goto onError;
2516 unicode = PyCodec_Decode(buffer, encoding, errors);
2517 if (unicode == NULL)
2518 goto onError;
2519 if (!PyUnicode_Check(unicode)) {
2520 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002521 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002522 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002523 Py_DECREF(unicode);
2524 goto onError;
2525 }
2526 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002527 if (PyUnicode_READY(unicode)) {
2528 Py_DECREF(unicode);
2529 return NULL;
2530 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002532
Benjamin Peterson29060642009-01-31 22:14:21 +00002533 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002534 Py_XDECREF(buffer);
2535 return NULL;
2536}
2537
Alexander Belopolsky40018472011-02-26 01:02:56 +00002538PyObject *
2539PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002540 const char *encoding,
2541 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002542{
2543 PyObject *v;
2544
2545 if (!PyUnicode_Check(unicode)) {
2546 PyErr_BadArgument();
2547 goto onError;
2548 }
2549
2550 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002551 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002552
2553 /* Decode via the codec registry */
2554 v = PyCodec_Decode(unicode, encoding, errors);
2555 if (v == NULL)
2556 goto onError;
2557 return v;
2558
Benjamin Peterson29060642009-01-31 22:14:21 +00002559 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002560 return NULL;
2561}
2562
Alexander Belopolsky40018472011-02-26 01:02:56 +00002563PyObject *
2564PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002565 const char *encoding,
2566 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002567{
2568 PyObject *v;
2569
2570 if (!PyUnicode_Check(unicode)) {
2571 PyErr_BadArgument();
2572 goto onError;
2573 }
2574
2575 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002576 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002577
2578 /* Decode via the codec registry */
2579 v = PyCodec_Decode(unicode, encoding, errors);
2580 if (v == NULL)
2581 goto onError;
2582 if (!PyUnicode_Check(v)) {
2583 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002584 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002585 Py_TYPE(v)->tp_name);
2586 Py_DECREF(v);
2587 goto onError;
2588 }
2589 return v;
2590
Benjamin Peterson29060642009-01-31 22:14:21 +00002591 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002592 return NULL;
2593}
2594
Alexander Belopolsky40018472011-02-26 01:02:56 +00002595PyObject *
2596PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002597 Py_ssize_t size,
2598 const char *encoding,
2599 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002600{
2601 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002602
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603 unicode = PyUnicode_FromUnicode(s, size);
2604 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002605 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002606 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2607 Py_DECREF(unicode);
2608 return v;
2609}
2610
Alexander Belopolsky40018472011-02-26 01:02:56 +00002611PyObject *
2612PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002613 const char *encoding,
2614 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002615{
2616 PyObject *v;
2617
2618 if (!PyUnicode_Check(unicode)) {
2619 PyErr_BadArgument();
2620 goto onError;
2621 }
2622
2623 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002624 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002625
2626 /* Encode via the codec registry */
2627 v = PyCodec_Encode(unicode, encoding, errors);
2628 if (v == NULL)
2629 goto onError;
2630 return v;
2631
Benjamin Peterson29060642009-01-31 22:14:21 +00002632 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002633 return NULL;
2634}
2635
Victor Stinnerad158722010-10-27 00:25:46 +00002636PyObject *
2637PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002638{
Victor Stinner99b95382011-07-04 14:23:54 +02002639#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002640 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2641 PyUnicode_GET_SIZE(unicode),
2642 NULL);
2643#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002644 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002645#else
Victor Stinner793b5312011-04-27 00:24:21 +02002646 PyInterpreterState *interp = PyThreadState_GET()->interp;
2647 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2648 cannot use it to encode and decode filenames before it is loaded. Load
2649 the Python codec requires to encode at least its own filename. Use the C
2650 version of the locale codec until the codec registry is initialized and
2651 the Python codec is loaded.
2652
2653 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2654 cannot only rely on it: check also interp->fscodec_initialized for
2655 subinterpreters. */
2656 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002657 return PyUnicode_AsEncodedString(unicode,
2658 Py_FileSystemDefaultEncoding,
2659 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002660 }
2661 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002662 /* locale encoding with surrogateescape */
2663 wchar_t *wchar;
2664 char *bytes;
2665 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002666 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002667
2668 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2669 if (wchar == NULL)
2670 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002671 bytes = _Py_wchar2char(wchar, &error_pos);
2672 if (bytes == NULL) {
2673 if (error_pos != (size_t)-1) {
2674 char *errmsg = strerror(errno);
2675 PyObject *exc = NULL;
2676 if (errmsg == NULL)
2677 errmsg = "Py_wchar2char() failed";
2678 raise_encode_exception(&exc,
2679 "filesystemencoding",
2680 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2681 error_pos, error_pos+1,
2682 errmsg);
2683 Py_XDECREF(exc);
2684 }
2685 else
2686 PyErr_NoMemory();
2687 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002688 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002689 }
2690 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002691
2692 bytes_obj = PyBytes_FromString(bytes);
2693 PyMem_Free(bytes);
2694 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002695 }
Victor Stinnerad158722010-10-27 00:25:46 +00002696#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002697}
2698
Alexander Belopolsky40018472011-02-26 01:02:56 +00002699PyObject *
2700PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002701 const char *encoding,
2702 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002703{
2704 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002705 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002706
Guido van Rossumd57fd912000-03-10 22:53:23 +00002707 if (!PyUnicode_Check(unicode)) {
2708 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002709 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710 }
Fred Drakee4315f52000-05-09 19:53:39 +00002711
Victor Stinner2f283c22011-03-02 01:21:46 +00002712 if (encoding == NULL) {
2713 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002714 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002715 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002716 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002717 }
Fred Drakee4315f52000-05-09 19:53:39 +00002718
2719 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002720 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002721 if ((strcmp(lower, "utf-8") == 0) ||
2722 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002723 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002724 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002725 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002726 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002727 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002728 }
Victor Stinner37296e82010-06-10 13:36:23 +00002729 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002730 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002731 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002732 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002733#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002734 else if (strcmp(lower, "mbcs") == 0)
2735 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2736 PyUnicode_GET_SIZE(unicode),
2737 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002738#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002739 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002740 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002741 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742
2743 /* Encode via the codec registry */
2744 v = PyCodec_Encode(unicode, encoding, errors);
2745 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002746 return NULL;
2747
2748 /* The normal path */
2749 if (PyBytes_Check(v))
2750 return v;
2751
2752 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002753 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002754 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002755 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002756
2757 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2758 "encoder %s returned bytearray instead of bytes",
2759 encoding);
2760 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002761 Py_DECREF(v);
2762 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002763 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002764
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002765 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2766 Py_DECREF(v);
2767 return b;
2768 }
2769
2770 PyErr_Format(PyExc_TypeError,
2771 "encoder did not return a bytes object (type=%.400s)",
2772 Py_TYPE(v)->tp_name);
2773 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002774 return NULL;
2775}
2776
Alexander Belopolsky40018472011-02-26 01:02:56 +00002777PyObject *
2778PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002779 const char *encoding,
2780 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002781{
2782 PyObject *v;
2783
2784 if (!PyUnicode_Check(unicode)) {
2785 PyErr_BadArgument();
2786 goto onError;
2787 }
2788
2789 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002790 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002791
2792 /* Encode via the codec registry */
2793 v = PyCodec_Encode(unicode, encoding, errors);
2794 if (v == NULL)
2795 goto onError;
2796 if (!PyUnicode_Check(v)) {
2797 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002798 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002799 Py_TYPE(v)->tp_name);
2800 Py_DECREF(v);
2801 goto onError;
2802 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002804
Benjamin Peterson29060642009-01-31 22:14:21 +00002805 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806 return NULL;
2807}
2808
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002809PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002810PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002811 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002812 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2813}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002814
Christian Heimes5894ba72007-11-04 11:43:14 +00002815PyObject*
2816PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2817{
Victor Stinner99b95382011-07-04 14:23:54 +02002818#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002819 return PyUnicode_DecodeMBCS(s, size, NULL);
2820#elif defined(__APPLE__)
2821 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2822#else
Victor Stinner793b5312011-04-27 00:24:21 +02002823 PyInterpreterState *interp = PyThreadState_GET()->interp;
2824 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2825 cannot use it to encode and decode filenames before it is loaded. Load
2826 the Python codec requires to encode at least its own filename. Use the C
2827 version of the locale codec until the codec registry is initialized and
2828 the Python codec is loaded.
2829
2830 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2831 cannot only rely on it: check also interp->fscodec_initialized for
2832 subinterpreters. */
2833 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002834 return PyUnicode_Decode(s, size,
2835 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002836 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002837 }
2838 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002839 /* locale encoding with surrogateescape */
2840 wchar_t *wchar;
2841 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002842 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002843
2844 if (s[size] != '\0' || size != strlen(s)) {
2845 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2846 return NULL;
2847 }
2848
Victor Stinner168e1172010-10-16 23:16:16 +00002849 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002850 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002851 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002852
Victor Stinner168e1172010-10-16 23:16:16 +00002853 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002854 PyMem_Free(wchar);
2855 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002856 }
Victor Stinnerad158722010-10-27 00:25:46 +00002857#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002858}
2859
Martin v. Löwis011e8422009-05-05 04:43:17 +00002860
2861int
2862PyUnicode_FSConverter(PyObject* arg, void* addr)
2863{
2864 PyObject *output = NULL;
2865 Py_ssize_t size;
2866 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002867 if (arg == NULL) {
2868 Py_DECREF(*(PyObject**)addr);
2869 return 1;
2870 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002871 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002872 output = arg;
2873 Py_INCREF(output);
2874 }
2875 else {
2876 arg = PyUnicode_FromObject(arg);
2877 if (!arg)
2878 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002879 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002880 Py_DECREF(arg);
2881 if (!output)
2882 return 0;
2883 if (!PyBytes_Check(output)) {
2884 Py_DECREF(output);
2885 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2886 return 0;
2887 }
2888 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002889 size = PyBytes_GET_SIZE(output);
2890 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002891 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002892 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002893 Py_DECREF(output);
2894 return 0;
2895 }
2896 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002897 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002898}
2899
2900
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002901int
2902PyUnicode_FSDecoder(PyObject* arg, void* addr)
2903{
2904 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002905 if (arg == NULL) {
2906 Py_DECREF(*(PyObject**)addr);
2907 return 1;
2908 }
2909 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002910 if (PyUnicode_READY(arg))
2911 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002912 output = arg;
2913 Py_INCREF(output);
2914 }
2915 else {
2916 arg = PyBytes_FromObject(arg);
2917 if (!arg)
2918 return 0;
2919 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2920 PyBytes_GET_SIZE(arg));
2921 Py_DECREF(arg);
2922 if (!output)
2923 return 0;
2924 if (!PyUnicode_Check(output)) {
2925 Py_DECREF(output);
2926 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2927 return 0;
2928 }
2929 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002930 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2931 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002932 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2933 Py_DECREF(output);
2934 return 0;
2935 }
2936 *(PyObject**)addr = output;
2937 return Py_CLEANUP_SUPPORTED;
2938}
2939
2940
Martin v. Löwis5b222132007-06-10 09:51:05 +00002941char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002942PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002943{
Christian Heimesf3863112007-11-22 07:46:41 +00002944 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002945 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2946
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002947 if (!PyUnicode_Check(unicode)) {
2948 PyErr_BadArgument();
2949 return NULL;
2950 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002951 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002952 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002953
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002954 if (PyUnicode_UTF8(unicode) == NULL) {
2955 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002956 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2957 if (bytes == NULL)
2958 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002959 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2960 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002961 Py_DECREF(bytes);
2962 return NULL;
2963 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002964 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
2965 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002966 Py_DECREF(bytes);
2967 }
2968
2969 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002970 *psize = PyUnicode_UTF8_LENGTH(unicode);
2971 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002972}
2973
2974char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002975PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002976{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002977 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2978}
2979
/* Debug-build-only counter: PyUnicode_AsUnicodeAndSize() increments it each
   time it has to build a wstr buffer for an object that does not have one. */
2980#ifdef Py_DEBUG
2981int unicode_as_unicode_calls = 0;
2982#endif
2983
2984
2985Py_UNICODE *
2986PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2987{
2988 PyUnicodeObject *u;
2989 const unsigned char *one_byte;
2990#if SIZEOF_WCHAR_T == 4
2991 const Py_UCS2 *two_bytes;
2992#else
2993 const Py_UCS4 *four_bytes;
2994 const Py_UCS4 *ucs4_end;
2995 Py_ssize_t num_surrogates;
2996#endif
2997 wchar_t *w;
2998 wchar_t *wchar_end;
2999
3000 if (!PyUnicode_Check(unicode)) {
3001 PyErr_BadArgument();
3002 return NULL;
3003 }
3004 u = (PyUnicodeObject*)unicode;
3005 if (_PyUnicode_WSTR(u) == NULL) {
3006 /* Non-ASCII compact unicode object */
3007 assert(_PyUnicode_KIND(u) != 0);
3008 assert(PyUnicode_IS_READY(u));
3009
3010#ifdef Py_DEBUG
3011 ++unicode_as_unicode_calls;
3012#endif
3013
3014 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3015#if SIZEOF_WCHAR_T == 2
3016 four_bytes = PyUnicode_4BYTE_DATA(u);
3017 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3018 num_surrogates = 0;
3019
3020 for (; four_bytes < ucs4_end; ++four_bytes) {
3021 if (*four_bytes > 0xFFFF)
3022 ++num_surrogates;
3023 }
3024
3025 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3026 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3027 if (!_PyUnicode_WSTR(u)) {
3028 PyErr_NoMemory();
3029 return NULL;
3030 }
3031 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3032
3033 w = _PyUnicode_WSTR(u);
3034 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3035 four_bytes = PyUnicode_4BYTE_DATA(u);
3036 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3037 if (*four_bytes > 0xFFFF) {
3038 /* encode surrogate pair in this case */
3039 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3040 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3041 }
3042 else
3043 *w = *four_bytes;
3044
3045 if (w > wchar_end) {
3046 assert(0 && "Miscalculated string end");
3047 }
3048 }
3049 *w = 0;
3050#else
3051 /* sizeof(wchar_t) == 4 */
3052 Py_FatalError("Impossible unicode object state, wstr and str "
3053 "should share memory already.");
3054 return NULL;
3055#endif
3056 }
3057 else {
3058 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3059 (_PyUnicode_LENGTH(u) + 1));
3060 if (!_PyUnicode_WSTR(u)) {
3061 PyErr_NoMemory();
3062 return NULL;
3063 }
3064 if (!PyUnicode_IS_COMPACT_ASCII(u))
3065 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3066 w = _PyUnicode_WSTR(u);
3067 wchar_end = w + _PyUnicode_LENGTH(u);
3068
3069 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3070 one_byte = PyUnicode_1BYTE_DATA(u);
3071 for (; w < wchar_end; ++one_byte, ++w)
3072 *w = *one_byte;
3073 /* null-terminate the wstr */
3074 *w = 0;
3075 }
3076 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3077#if SIZEOF_WCHAR_T == 4
3078 two_bytes = PyUnicode_2BYTE_DATA(u);
3079 for (; w < wchar_end; ++two_bytes, ++w)
3080 *w = *two_bytes;
3081 /* null-terminate the wstr */
3082 *w = 0;
3083#else
3084 /* sizeof(wchar_t) == 2 */
3085 PyObject_FREE(_PyUnicode_WSTR(u));
3086 _PyUnicode_WSTR(u) = NULL;
3087 Py_FatalError("Impossible unicode object state, wstr "
3088 "and str should share memory already.");
3089 return NULL;
3090#endif
3091 }
3092 else {
3093 assert(0 && "This should never happen.");
3094 }
3095 }
3096 }
3097 if (size != NULL)
3098 *size = PyUnicode_WSTR_LENGTH(u);
3099 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003100}
3101
Alexander Belopolsky40018472011-02-26 01:02:56 +00003102Py_UNICODE *
3103PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003105 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003106}
3107
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003108
Alexander Belopolsky40018472011-02-26 01:02:56 +00003109Py_ssize_t
3110PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111{
3112 if (!PyUnicode_Check(unicode)) {
3113 PyErr_BadArgument();
3114 goto onError;
3115 }
3116 return PyUnicode_GET_SIZE(unicode);
3117
Benjamin Peterson29060642009-01-31 22:14:21 +00003118 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003119 return -1;
3120}
3121
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003122Py_ssize_t
3123PyUnicode_GetLength(PyObject *unicode)
3124{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003125 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003126 PyErr_BadArgument();
3127 return -1;
3128 }
3129
3130 return PyUnicode_GET_LENGTH(unicode);
3131}
3132
3133Py_UCS4
3134PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3135{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003136 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3137 PyErr_BadArgument();
3138 return (Py_UCS4)-1;
3139 }
3140 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3141 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003142 return (Py_UCS4)-1;
3143 }
3144 return PyUnicode_READ_CHAR(unicode, index);
3145}
3146
3147int
3148PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3149{
3150 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003151 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003152 return -1;
3153 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003154 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3155 PyErr_SetString(PyExc_IndexError, "string index out of range");
3156 return -1;
3157 }
3158 if (_PyUnicode_Dirty(unicode))
3159 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003160 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3161 index, ch);
3162 return 0;
3163}
3164
/* Return the name of the default encoding.  This implementation always
   answers "utf-8"; the string has static storage duration and must not
   be modified or freed by the caller. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3170
Victor Stinner554f3f02010-06-16 23:33:54 +00003171/* create or adjust a UnicodeDecodeError */
3172static void
3173make_decode_exception(PyObject **exceptionObject,
3174 const char *encoding,
3175 const char *input, Py_ssize_t length,
3176 Py_ssize_t startpos, Py_ssize_t endpos,
3177 const char *reason)
3178{
3179 if (*exceptionObject == NULL) {
3180 *exceptionObject = PyUnicodeDecodeError_Create(
3181 encoding, input, length, startpos, endpos, reason);
3182 }
3183 else {
3184 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3185 goto onError;
3186 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3187 goto onError;
3188 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3189 goto onError;
3190 }
3191 return;
3192
3193onError:
3194 Py_DECREF(*exceptionObject);
3195 *exceptionObject = NULL;
3196}
3197
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003198/* error handling callback helper:
3199 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003200 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003201 and adjust various state variables.
3202 return 0 on success, -1 on error
3203*/
3204
Alexander Belopolsky40018472011-02-26 01:02:56 +00003205static int
3206unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003207 const char *encoding, const char *reason,
3208 const char **input, const char **inend, Py_ssize_t *startinpos,
3209 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3210 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003211{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003212 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003213
3214 PyObject *restuple = NULL;
3215 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003216 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003217 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003218 Py_ssize_t requiredsize;
3219 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003220 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003221 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003222 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003223 int res = -1;
3224
3225 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003226 *errorHandler = PyCodec_LookupError(errors);
3227 if (*errorHandler == NULL)
3228 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003229 }
3230
Victor Stinner554f3f02010-06-16 23:33:54 +00003231 make_decode_exception(exceptionObject,
3232 encoding,
3233 *input, *inend - *input,
3234 *startinpos, *endinpos,
3235 reason);
3236 if (*exceptionObject == NULL)
3237 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003238
3239 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3240 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003241 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003242 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003243 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003244 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003245 }
3246 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003247 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003248
3249 /* Copy back the bytes variables, which might have been modified by the
3250 callback */
3251 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3252 if (!inputobj)
3253 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003254 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003255 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003256 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003257 *input = PyBytes_AS_STRING(inputobj);
3258 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003259 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003260 /* we can DECREF safely, as the exception has another reference,
3261 so the object won't go away. */
3262 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003263
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003264 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003265 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003266 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003267 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3268 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003269 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003270
3271 /* need more space? (at least enough for what we
3272 have+the replacement+the rest of the string (starting
3273 at the new input position), so we won't have to check space
3274 when there are no errors in the rest of the string) */
3275 repptr = PyUnicode_AS_UNICODE(repunicode);
3276 repsize = PyUnicode_GET_SIZE(repunicode);
3277 requiredsize = *outpos + repsize + insize-newpos;
3278 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003279 if (requiredsize<2*outsize)
3280 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003281 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003282 goto onError;
3283 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003284 }
3285 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003286 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003287 Py_UNICODE_COPY(*outptr, repptr, repsize);
3288 *outptr += repsize;
3289 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003291 /* we made it! */
3292 res = 0;
3293
Benjamin Peterson29060642009-01-31 22:14:21 +00003294 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003295 Py_XDECREF(restuple);
3296 return res;
3297}
3298
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003299/* --- UTF-7 Codec -------------------------------------------------------- */
3300
Antoine Pitrou244651a2009-05-04 18:56:13 +00003301/* See RFC2152 for details. We encode conservatively and decode liberally. */
3302
3303/* Three simple macros defining base-64. */
/* Note: IS_BASE64, FROM_BASE64 and DECODE_DIRECT expand their argument
   several times, so the argument must be free of side effects. */
3304
3305/* Is c a base-64 character? */
3306
3307#define IS_BASE64(c) \
3308 (((c) >= 'A' && (c) <= 'Z') || \
3309 ((c) >= 'a' && (c) <= 'z') || \
3310 ((c) >= '0' && (c) <= '9') || \
3311 (c) == '+' || (c) == '/')
3312
3313/* given that c is a base-64 character, what is its base-64 value? */
/* 'A'-'Z' map to 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61, '+' to 62.
   Every other input yields 63, so the result is only meaningful when
   IS_BASE64(c) holds. */
3314
3315#define FROM_BASE64(c) \
3316 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
3317 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
3318 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
3319 (c) == '+' ? 62 : 63)
3320
3321/* What is the base-64 character of the bottom 6 bits of n? */
/* Higher bits of n are masked off with 0x3f before the table lookup. */
3322
3323#define TO_BASE64(n) \
3324 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
3325
3326/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
3327 * decoded as itself. We are permissive on decoding; the only ASCII
3328 * byte not decoding to itself is the + which begins a base64
3329 * string. */
3330
3331#define DECODE_DIRECT(c) \
3332 ((c) <= 127 && (c) != '+')
3333
3334/* The UTF-7 encoder treats ASCII characters differently according to
3335 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
3336 * the above). See RFC2152. This array identifies these different
3337 * sets:
3338 * 0 : "Set D"
3339 * alphanumeric and '(),-./:?
3340 * 1 : "Set O"
3341 * !"#$%&*;<=>@[]^_`{|}
3342 * 2 : "whitespace"
3343 * ht nl cr sp
3344 * 3 : special (must be base64 encoded)
3345 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
3346 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003347
/* Indexed by ASCII code (0..127), 16 entries per row; the comment above
   each row names the characters of that row.  ENCODE_DIRECT is the reader
   of this table and checks (c) < 128 before indexing it. */
Tim Petersced69f82003-09-16 20:30:58 +00003348static
Antoine Pitrou244651a2009-05-04 18:56:13 +00003349char utf7_category[128] = {
3350/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
3351 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
3352/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
3353 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3354/* sp ! " # $ % & ' ( ) * + , - . / */
3355 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
3356/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
3357 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
3358/* @ A B C D E F G H I J K L M N O */
3359 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3360/* P Q R S T U V W X Y Z [ \ ] ^ _ */
3361 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
3362/* ` a b c d e f g h i j k l m n o */
3363 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3364/* p q r s t u v w x y z { | } ~ del */
3365 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003366};
3367
Antoine Pitrou244651a2009-05-04 18:56:13 +00003368/* ENCODE_DIRECT: this character should be encoded as itself. The
3369 * answer depends on whether we are encoding set O as itself, and also
3370 * on whether we are encoding whitespace as itself. RFC2152 makes it
3371 * clear that the answers to these questions vary between
3372 * applications, so this code needs to be flexible. */
Marc-André Lemburge115ec82005-10-19 22:33:31 +00003373
/* c        - character code to test; NUL (0) and anything >= 128 are never
              direct
   directO  - non-zero: category 1 ("Set O") characters are direct
   directWS - non-zero: category 2 (whitespace) characters are direct
   Category 0 ("Set D") characters are always direct.  c is expanded
   several times and directO/directWS are pasted without parentheses, so
   pass simple, side-effect-free expressions. */
Antoine Pitrou244651a2009-05-04 18:56:13 +00003374#define ENCODE_DIRECT(c, directO, directWS) \
3375 ((c) < 128 && (c) > 0 && \
3376 ((utf7_category[(c)] == 0) || \
3377 (directWS && (utf7_category[(c)] == 2)) || \
3378 (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003379
Alexander Belopolsky40018472011-02-26 01:02:56 +00003380PyObject *
3381PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003382 Py_ssize_t size,
3383 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003384{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003385 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3386}
3387
Antoine Pitrou244651a2009-05-04 18:56:13 +00003388/* The decoder. The only state we preserve is our read position,
3389 * i.e. how many characters we have consumed. So if we end in the
3390 * middle of a shift sequence we have to back off the read position
3391 * and the output to the beginning of the sequence, otherwise we lose
3392 * all the shift state (seen bits, number of bits seen, high
3393 * surrogate). */
3394
Alexander Belopolsky40018472011-02-26 01:02:56 +00003395PyObject *
3396PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003397 Py_ssize_t size,
3398 const char *errors,
3399 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003400{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003401 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003402 Py_ssize_t startinpos;
3403 Py_ssize_t endinpos;
3404 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003405 const char *e;
3406 PyUnicodeObject *unicode;
3407 Py_UNICODE *p;
3408 const char *errmsg = "";
3409 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003410 Py_UNICODE *shiftOutStart;
3411 unsigned int base64bits = 0;
3412 unsigned long base64buffer = 0;
3413 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003414 PyObject *errorHandler = NULL;
3415 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003416
3417 unicode = _PyUnicode_New(size);
3418 if (!unicode)
3419 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003420 if (size == 0) {
3421 if (consumed)
3422 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003423 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003424 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003425
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003426 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003427 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003428 e = s + size;
3429
3430 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003431 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003432 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003433 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003434
Antoine Pitrou244651a2009-05-04 18:56:13 +00003435 if (inShift) { /* in a base-64 section */
3436 if (IS_BASE64(ch)) { /* consume a base-64 character */
3437 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3438 base64bits += 6;
3439 s++;
3440 if (base64bits >= 16) {
3441 /* we have enough bits for a UTF-16 value */
3442 Py_UNICODE outCh = (Py_UNICODE)
3443 (base64buffer >> (base64bits-16));
3444 base64bits -= 16;
3445 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3446 if (surrogate) {
3447 /* expecting a second surrogate */
3448 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3449#ifdef Py_UNICODE_WIDE
3450 *p++ = (((surrogate & 0x3FF)<<10)
3451 | (outCh & 0x3FF)) + 0x10000;
3452#else
3453 *p++ = surrogate;
3454 *p++ = outCh;
3455#endif
3456 surrogate = 0;
3457 }
3458 else {
3459 surrogate = 0;
3460 errmsg = "second surrogate missing";
3461 goto utf7Error;
3462 }
3463 }
3464 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3465 /* first surrogate */
3466 surrogate = outCh;
3467 }
3468 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3469 errmsg = "unexpected second surrogate";
3470 goto utf7Error;
3471 }
3472 else {
3473 *p++ = outCh;
3474 }
3475 }
3476 }
3477 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003478 inShift = 0;
3479 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003480 if (surrogate) {
3481 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003482 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003483 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003484 if (base64bits > 0) { /* left-over bits */
3485 if (base64bits >= 6) {
3486 /* We've seen at least one base-64 character */
3487 errmsg = "partial character in shift sequence";
3488 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003489 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003490 else {
3491 /* Some bits remain; they should be zero */
3492 if (base64buffer != 0) {
3493 errmsg = "non-zero padding bits in shift sequence";
3494 goto utf7Error;
3495 }
3496 }
3497 }
3498 if (ch != '-') {
3499 /* '-' is absorbed; other terminating
3500 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003501 *p++ = ch;
3502 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003503 }
3504 }
3505 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003506 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003507 s++; /* consume '+' */
3508 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003509 s++;
3510 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003511 }
3512 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003513 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003514 shiftOutStart = p;
3515 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003516 }
3517 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003518 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003519 *p++ = ch;
3520 s++;
3521 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003522 else {
3523 startinpos = s-starts;
3524 s++;
3525 errmsg = "unexpected special character";
3526 goto utf7Error;
3527 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003528 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003529utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 outpos = p-PyUnicode_AS_UNICODE(unicode);
3531 endinpos = s-starts;
3532 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003533 errors, &errorHandler,
3534 "utf7", errmsg,
3535 &starts, &e, &startinpos, &endinpos, &exc, &s,
3536 &unicode, &outpos, &p))
3537 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003538 }
3539
Antoine Pitrou244651a2009-05-04 18:56:13 +00003540 /* end of string */
3541
3542 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3543 /* if we're in an inconsistent state, that's an error */
3544 if (surrogate ||
3545 (base64bits >= 6) ||
3546 (base64bits > 0 && base64buffer != 0)) {
3547 outpos = p-PyUnicode_AS_UNICODE(unicode);
3548 endinpos = size;
3549 if (unicode_decode_call_errorhandler(
3550 errors, &errorHandler,
3551 "utf7", "unterminated shift sequence",
3552 &starts, &e, &startinpos, &endinpos, &exc, &s,
3553 &unicode, &outpos, &p))
3554 goto onError;
3555 if (s < e)
3556 goto restart;
3557 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003558 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003559
3560 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003561 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003562 if (inShift) {
3563 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003564 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003565 }
3566 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003567 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003568 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003569 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003570
Victor Stinnerfe226c02011-10-03 03:52:20 +02003571 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003572 goto onError;
3573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 Py_XDECREF(errorHandler);
3575 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003576 if (PyUnicode_READY(unicode) == -1) {
3577 Py_DECREF(unicode);
3578 return NULL;
3579 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003580 return (PyObject *)unicode;
3581
Benjamin Peterson29060642009-01-31 22:14:21 +00003582 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003583 Py_XDECREF(errorHandler);
3584 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003585 Py_DECREF(unicode);
3586 return NULL;
3587}
3588
3589
Alexander Belopolsky40018472011-02-26 01:02:56 +00003590PyObject *
3591PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003592 Py_ssize_t size,
3593 int base64SetO,
3594 int base64WhiteSpace,
3595 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003596{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003597 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003598 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003599 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003600 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003601 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003602 unsigned int base64bits = 0;
3603 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003604 char * out;
3605 char * start;
3606
3607 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003608 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003609
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003610 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003611 return PyErr_NoMemory();
3612
Antoine Pitrou244651a2009-05-04 18:56:13 +00003613 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003614 if (v == NULL)
3615 return NULL;
3616
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003617 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003618 for (;i < size; ++i) {
3619 Py_UNICODE ch = s[i];
3620
Antoine Pitrou244651a2009-05-04 18:56:13 +00003621 if (inShift) {
3622 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3623 /* shifting out */
3624 if (base64bits) { /* output remaining bits */
3625 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3626 base64buffer = 0;
3627 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003628 }
3629 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003630 /* Characters not in the BASE64 set implicitly unshift the sequence
3631 so no '-' is required, except if the character is itself a '-' */
3632 if (IS_BASE64(ch) || ch == '-') {
3633 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003634 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003635 *out++ = (char) ch;
3636 }
3637 else {
3638 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003639 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003640 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003641 else { /* not in a shift sequence */
3642 if (ch == '+') {
3643 *out++ = '+';
3644 *out++ = '-';
3645 }
3646 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3647 *out++ = (char) ch;
3648 }
3649 else {
3650 *out++ = '+';
3651 inShift = 1;
3652 goto encode_char;
3653 }
3654 }
3655 continue;
3656encode_char:
3657#ifdef Py_UNICODE_WIDE
3658 if (ch >= 0x10000) {
3659 /* code first surrogate */
3660 base64bits += 16;
3661 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3662 while (base64bits >= 6) {
3663 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3664 base64bits -= 6;
3665 }
3666 /* prepare second surrogate */
3667 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3668 }
3669#endif
3670 base64bits += 16;
3671 base64buffer = (base64buffer << 16) | ch;
3672 while (base64bits >= 6) {
3673 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3674 base64bits -= 6;
3675 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003676 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003677 if (base64bits)
3678 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3679 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003680 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003681 if (_PyBytes_Resize(&v, out - start) < 0)
3682 return NULL;
3683 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003684}
3685
/* The UTF-7 helper macros are only meant for the UTF-7 codec functions;
   undefine them so the names do not leak into the rest of the file. */
Antoine Pitrou244651a2009-05-04 18:56:13 +00003686#undef IS_BASE64
3687#undef FROM_BASE64
3688#undef TO_BASE64
3689#undef DECODE_DIRECT
3690#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003691
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692/* --- UTF-8 Codec -------------------------------------------------------- */
3693
/* Lookup table indexed by the first byte (0x00-0xFF) of a UTF-8 sequence,
   16 entries per row.  Each entry is the total length in bytes of the
   sequence that byte introduces; 0 marks a byte that cannot start a
   sequence (0x80-0xBF, 0xC0-0xC1 and 0xF5-0xFF in this table). */
Tim Petersced69f82003-09-16 20:30:58 +00003694static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695char utf8_code_length[256] = {
Ezio Melotti57221d02010-07-01 07:32:02 +00003696 /* Map UTF-8 encoded prefix byte to sequence length. Zero means
3697 illegal prefix. See RFC 3629 for details */
3698 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
3699 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003700 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
3702 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
3703 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
3704 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
Ezio Melotti57221d02010-07-01 07:32:02 +00003705 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
3706 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
3708 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
Ezio Melotti57221d02010-07-01 07:32:02 +00003709 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
3710 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
3711 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
3712 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
3713 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003714};
3715
Alexander Belopolsky40018472011-02-26 01:02:56 +00003716PyObject *
3717PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003718 Py_ssize_t size,
3719 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003720{
Walter Dörwald69652032004-09-07 20:24:22 +00003721 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3722}
3723
Antoine Pitrouab868312009-01-10 15:40:25 +00003724/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3725#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3726
3727/* Mask to quickly check whether a C 'long' contains a
3728 non-ASCII, UTF8-encoded char. */
3729#if (SIZEOF_LONG == 8)
3730# define ASCII_CHAR_MASK 0x8080808080808080L
3731#elif (SIZEOF_LONG == 4)
3732# define ASCII_CHAR_MASK 0x80808080L
3733#else
3734# error C 'long' size should be either 4 or 8!
3735#endif
3736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003737/* Scans a UTF-8 string and returns the maximum character to be expected,
3738 the size of the decoded unicode string and if any major errors were
3739 encountered.
3740
3741 This function does check basic UTF-8 sanity, it does however NOT CHECK
3742 if the string contains surrogates, and if all continuation bytes are
3743 within the correct ranges, these checks are performed in
3744 PyUnicode_DecodeUTF8Stateful.
3745
3746 If it sets has_errors to 1, it means the value of unicode_size and max_char
3747 will be bogus and you should not rely on useful information in them.
3748 */
3749static Py_UCS4
3750utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3751 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3752 int *has_errors)
3753{
3754 Py_ssize_t n;
3755 Py_ssize_t char_count = 0;
3756 Py_UCS4 max_char = 127, new_max;
3757 Py_UCS4 upper_bound;
3758 const unsigned char *p = (const unsigned char *)s;
3759 const unsigned char *end = p + string_size;
3760 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3761 int err = 0;
3762
3763 for (; p < end && !err; ++p, ++char_count) {
3764 /* Only check value if it's not a ASCII char... */
3765 if (*p < 0x80) {
3766 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3767 an explanation. */
3768 if (!((size_t) p & LONG_PTR_MASK)) {
3769 /* Help register allocation */
3770 register const unsigned char *_p = p;
3771 while (_p < aligned_end) {
3772 unsigned long value = *(unsigned long *) _p;
3773 if (value & ASCII_CHAR_MASK)
3774 break;
3775 _p += SIZEOF_LONG;
3776 char_count += SIZEOF_LONG;
3777 }
3778 p = _p;
3779 if (p == end)
3780 break;
3781 }
3782 }
3783 if (*p >= 0x80) {
3784 n = utf8_code_length[*p];
3785 new_max = max_char;
3786 switch (n) {
3787 /* invalid start byte */
3788 case 0:
3789 err = 1;
3790 break;
3791 case 2:
3792 /* Code points between 0x00FF and 0x07FF inclusive.
3793 Approximate the upper bound of the code point,
3794 if this flips over 255 we can be sure it will be more
3795 than 255 and the string will need 2 bytes per code coint,
3796 if it stays under or equal to 255, we can be sure 1 byte
3797 is enough.
3798 ((*p & 0b00011111) << 6) | 0b00111111 */
3799 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3800 if (max_char < upper_bound)
3801 new_max = upper_bound;
3802 /* Ensure we track at least that we left ASCII space. */
3803 if (new_max < 128)
3804 new_max = 128;
3805 break;
3806 case 3:
3807 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3808 always > 255 and <= 65535 and will always need 2 bytes. */
3809 if (max_char < 65535)
3810 new_max = 65535;
3811 break;
3812 case 4:
3813 /* Code point will be above 0xFFFF for sure in this case. */
3814 new_max = 65537;
3815 break;
3816 /* Internal error, this should be caught by the first if */
3817 case 1:
3818 default:
3819 assert(0 && "Impossible case in utf8_max_char_and_size");
3820 err = 1;
3821 }
3822 /* Instead of number of overall bytes for this code point,
3823 n containts the number of following bytes: */
3824 --n;
3825 /* Check if the follow up chars are all valid continuation bytes */
3826 if (n >= 1) {
3827 const unsigned char *cont;
3828 if ((p + n) >= end) {
3829 if (consumed == 0)
3830 /* incomplete data, non-incremental decoding */
3831 err = 1;
3832 break;
3833 }
3834 for (cont = p + 1; cont < (p + n); ++cont) {
3835 if ((*cont & 0xc0) != 0x80) {
3836 err = 1;
3837 break;
3838 }
3839 }
3840 p += n;
3841 }
3842 else
3843 err = 1;
3844 max_char = new_max;
3845 }
3846 }
3847
3848 if (unicode_size)
3849 *unicode_size = char_count;
3850 if (has_errors)
3851 *has_errors = err;
3852 return max_char;
3853}
3854
/* Similar to PyUnicode_WRITE but can also write into wstr field
   of the legacy unicode representation.

   Stores `value` in element `index` of `buf`, using the element type that
   matches `kind`; any kind other than WCHAR, 1BYTE or 2BYTE is written as
   Py_UCS4.  Only the selected case is executed, so `index` and `value` are
   evaluated exactly once and may have side effects (e.g. `i++`). */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int k_ = (kind);                                              \
        switch (k_) {                                                       \
        case PyUnicode_WCHAR_KIND:                                          \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
            break;                                                          \
        case PyUnicode_1BYTE_KIND:                                          \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
            break;                                                          \
        case PyUnicode_2BYTE_KIND:                                          \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
            break;                                                          \
        default:                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
            break;                                                          \
        }                                                                   \
    } while (0)
3869
Alexander Belopolsky40018472011-02-26 01:02:56 +00003870PyObject *
3871PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003872 Py_ssize_t size,
3873 const char *errors,
3874 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003875{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003876 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003877 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003878 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003879 Py_ssize_t startinpos;
3880 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003881 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003882 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003883 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003884 PyObject *errorHandler = NULL;
3885 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003886 Py_UCS4 maxchar = 0;
3887 Py_ssize_t unicode_size;
3888 Py_ssize_t i;
3889 int kind;
3890 void *data;
3891 int has_errors;
3892 Py_UNICODE *error_outptr;
3893#if SIZEOF_WCHAR_T == 2
3894 Py_ssize_t wchar_offset = 0;
3895#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896
Walter Dörwald69652032004-09-07 20:24:22 +00003897 if (size == 0) {
3898 if (consumed)
3899 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003900 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003901 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003902 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3903 consumed, &has_errors);
3904 if (has_errors) {
3905 unicode = _PyUnicode_New(size);
3906 if (!unicode)
3907 return NULL;
3908 kind = PyUnicode_WCHAR_KIND;
3909 data = PyUnicode_AS_UNICODE(unicode);
3910 assert(data != NULL);
3911 }
3912 else {
3913 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3914 if (!unicode)
3915 return NULL;
3916 /* When the string is ASCII only, just use memcpy and return.
3917 unicode_size may be != size if there is an incomplete UTF-8
3918 sequence at the end of the ASCII block. */
3919 if (maxchar < 128 && size == unicode_size) {
3920 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3921 return (PyObject *)unicode;
3922 }
3923 kind = PyUnicode_KIND(unicode);
3924 data = PyUnicode_DATA(unicode);
3925 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003926 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003927 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003929 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003930
3931 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003932 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933
3934 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003935 /* Fast path for runs of ASCII characters. Given that common UTF-8
3936 input will consist of an overwhelming majority of ASCII
3937 characters, we try to optimize for this case by checking
3938 as many characters as a C 'long' can contain.
3939 First, check if we can do an aligned read, as most CPUs have
3940 a penalty for unaligned reads.
3941 */
3942 if (!((size_t) s & LONG_PTR_MASK)) {
3943 /* Help register allocation */
3944 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003945 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003946 while (_s < aligned_end) {
3947 /* Read a whole long at a time (either 4 or 8 bytes),
3948 and do a fast unrolled copy if it only contains ASCII
3949 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003950 unsigned long value = *(unsigned long *) _s;
3951 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003952 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003953 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3954 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3955 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3956 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003957#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003958 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3959 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3960 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3961 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003962#endif
3963 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003964 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003965 }
3966 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003967 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003968 if (s == e)
3969 break;
3970 ch = (unsigned char)*s;
3971 }
3972 }
3973
3974 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003975 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003976 s++;
3977 continue;
3978 }
3979
3980 n = utf8_code_length[ch];
3981
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003982 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003983 if (consumed)
3984 break;
3985 else {
3986 errmsg = "unexpected end of data";
3987 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003988 endinpos = startinpos+1;
3989 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3990 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003991 goto utf8Error;
3992 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003994
3995 switch (n) {
3996
3997 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003998 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003999 startinpos = s-starts;
4000 endinpos = startinpos+1;
4001 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004002
4003 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004004 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004005 startinpos = s-starts;
4006 endinpos = startinpos+1;
4007 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008
4009 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004010 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004011 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004012 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004013 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004014 goto utf8Error;
4015 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004016 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004017 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004018 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004019 break;
4020
4021 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004022 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4023 will result in surrogates in range d800-dfff. Surrogates are
4024 not valid UTF-8 so they are rejected.
4025 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4026 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004027 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004028 (s[2] & 0xc0) != 0x80 ||
4029 ((unsigned char)s[0] == 0xE0 &&
4030 (unsigned char)s[1] < 0xA0) ||
4031 ((unsigned char)s[0] == 0xED &&
4032 (unsigned char)s[1] > 0x9F)) {
4033 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004034 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004035 endinpos = startinpos + 1;
4036
4037 /* if s[1] first two bits are 1 and 0, then the invalid
4038 continuation byte is s[2], so increment endinpos by 1,
4039 if not, s[1] is invalid and endinpos doesn't need to
4040 be incremented. */
4041 if ((s[1] & 0xC0) == 0x80)
4042 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004043 goto utf8Error;
4044 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004045 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004046 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004047 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004048 break;
4049
4050 case 4:
4051 if ((s[1] & 0xc0) != 0x80 ||
4052 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004053 (s[3] & 0xc0) != 0x80 ||
4054 ((unsigned char)s[0] == 0xF0 &&
4055 (unsigned char)s[1] < 0x90) ||
4056 ((unsigned char)s[0] == 0xF4 &&
4057 (unsigned char)s[1] > 0x8F)) {
4058 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004059 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004060 endinpos = startinpos + 1;
4061 if ((s[1] & 0xC0) == 0x80) {
4062 endinpos++;
4063 if ((s[2] & 0xC0) == 0x80)
4064 endinpos++;
4065 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004066 goto utf8Error;
4067 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004068 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004069 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4070 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004072 /* If the string is flexible or we have native UCS-4, write
4073 directly.. */
4074 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4075 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004077 else {
4078 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004079
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004080 /* translate from 10000..10FFFF to 0..FFFF */
4081 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004082
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004083 /* high surrogate = top 10 bits added to D800 */
4084 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4085 (Py_UNICODE)(0xD800 + (ch >> 10)));
4086
4087 /* low surrogate = bottom 10 bits added to DC00 */
4088 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4089 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4090 }
4091#if SIZEOF_WCHAR_T == 2
4092 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004093#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095 }
4096 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004097 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004098
Benjamin Peterson29060642009-01-31 22:14:21 +00004099 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004100 /* If this is not yet a resizable string, make it one.. */
4101 if (kind != PyUnicode_WCHAR_KIND) {
4102 const Py_UNICODE *u;
4103 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4104 if (!new_unicode)
4105 goto onError;
4106 u = PyUnicode_AsUnicode((PyObject *)unicode);
4107 if (!u)
4108 goto onError;
4109#if SIZEOF_WCHAR_T == 2
4110 i += wchar_offset;
4111#endif
4112 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4113 Py_DECREF(unicode);
4114 unicode = new_unicode;
4115 kind = 0;
4116 data = PyUnicode_AS_UNICODE(new_unicode);
4117 assert(data != NULL);
4118 }
4119 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004120 if (unicode_decode_call_errorhandler(
4121 errors, &errorHandler,
4122 "utf8", errmsg,
4123 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004124 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004125 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004126 /* Update data because unicode_decode_call_errorhandler might have
4127 re-created or resized the unicode object. */
4128 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004129 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004131 /* Ensure the unicode_size calculation above was correct: */
4132 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4133
Walter Dörwald69652032004-09-07 20:24:22 +00004134 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004135 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004137 /* Adjust length and ready string when it contained errors and
4138 is of the old resizable kind. */
4139 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02004140 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0 ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004141 PyUnicode_READY(unicode) == -1)
4142 goto onError;
4143 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145 Py_XDECREF(errorHandler);
4146 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004147 if (PyUnicode_READY(unicode) == -1) {
4148 Py_DECREF(unicode);
4149 return NULL;
4150 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004151 return (PyObject *)unicode;
4152
Benjamin Peterson29060642009-01-31 22:14:21 +00004153 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004154 Py_XDECREF(errorHandler);
4155 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156 Py_DECREF(unicode);
4157 return NULL;
4158}
4159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004160#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004161
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Returns a buffer obtained from PyMem_Malloc() that is terminated by
   L'\0', or NULL if the buffer could not be allocated.  A byte that does
   not start a valid sequence is stored as 0xDC00 + byte and decoding
   resumes with the byte after it. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *stop;
    wchar_t *buffer, *out;
    int seqlen;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (buffer == NULL)
        return NULL;

    /* Unpack UTF-8 encoded data */
    for (out = buffer, stop = s + size; s < stop; ) {
        /* Until a sequence has been validated, ch holds the lead byte;
           the surrogateescape label below depends on that. */
        Py_UCS4 ch = (unsigned char)*s;
        unsigned char b0, b1, b2, b3;

        if (ch < 0x80) {
            *out++ = (wchar_t)ch;
            s++;
            continue;
        }

        seqlen = utf8_code_length[ch];
        if (s + seqlen > stop)
            goto surrogateescape;

        switch (seqlen) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            b0 = (unsigned char)s[0];
            b1 = (unsigned char)s[1];
            if ((b1 & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((b0 & 0x1f) << 6) + (b1 & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *out++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            b0 = (unsigned char)s[0];
            b1 = (unsigned char)s[1];
            b2 = (unsigned char)s[2];
            if ((b1 & 0xc0) != 0x80 || (b2 & 0xc0) != 0x80)
                goto surrogateescape;
            if (b0 == 0xE0 && b1 < 0xA0)
                goto surrogateescape;
            if (b0 == 0xED && b1 > 0x9F)
                goto surrogateescape;
            ch = ((b0 & 0x0f) << 12) + ((b1 & 0x3f) << 6) + (b2 & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *out++ = (wchar_t)ch;
            break;

        case 4:
            b0 = (unsigned char)s[0];
            b1 = (unsigned char)s[1];
            b2 = (unsigned char)s[2];
            b3 = (unsigned char)s[3];
            if ((b1 & 0xc0) != 0x80 ||
                (b2 & 0xc0) != 0x80 ||
                (b3 & 0xc0) != 0x80)
                goto surrogateescape;
            if (b0 == 0xF0 && b1 < 0x90)
                goto surrogateescape;
            if (b0 == 0xF4 && b1 > 0x8F)
                goto surrogateescape;
            ch = ((b0 & 0x7) << 18) + ((b1 & 0x3f) << 12) +
                 ((b2 & 0x3f) << 6) + (b3 & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *out++ = (wchar_t)ch;
#else
            /* compute and append the two surrogates: */

            /* translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /* high surrogate = top 10 bits added to D800 */
            *out++ = (wchar_t)(0xD800 + (ch >> 10));

            /* low surrogate = bottom 10 bits added to DC00 */
            *out++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += seqlen;
        continue;

      surrogateescape:
        *out++ = 0xDC00 + ch;
        s++;
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004277/* Primary internal function which creates utf8 encoded bytes objects.
4278
4279 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004280 and allocate exactly as much space needed at the end. Else allocate the
4281 maximum possible needed (4 result bytes per Unicode character), and return
4282 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004283*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004284PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004285_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286{
Tim Peters602f7402002-04-27 18:03:26 +00004287#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004288
Guido van Rossum98297ee2007-11-06 21:34:58 +00004289 Py_ssize_t i; /* index into s of next input byte */
4290 PyObject *result; /* result string object */
4291 char *p; /* next free byte in output buffer */
4292 Py_ssize_t nallocated; /* number of result bytes allocated */
4293 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004294 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004295 PyObject *errorHandler = NULL;
4296 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004297 int kind;
4298 void *data;
4299 Py_ssize_t size;
4300 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4301#if SIZEOF_WCHAR_T == 2
4302 Py_ssize_t wchar_offset = 0;
4303#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004305 if (!PyUnicode_Check(unicode)) {
4306 PyErr_BadArgument();
4307 return NULL;
4308 }
4309
4310 if (PyUnicode_READY(unicode) == -1)
4311 return NULL;
4312
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004313 if (PyUnicode_UTF8(unicode))
4314 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4315 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004316
4317 kind = PyUnicode_KIND(unicode);
4318 data = PyUnicode_DATA(unicode);
4319 size = PyUnicode_GET_LENGTH(unicode);
4320
Tim Peters602f7402002-04-27 18:03:26 +00004321 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322
Tim Peters602f7402002-04-27 18:03:26 +00004323 if (size <= MAX_SHORT_UNICHARS) {
4324 /* Write into the stack buffer; nallocated can't overflow.
4325 * At the end, we'll allocate exactly as much heap space as it
4326 * turns out we need.
4327 */
4328 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004329 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004330 p = stackbuf;
4331 }
4332 else {
4333 /* Overallocate on the heap, and give the excess back at the end. */
4334 nallocated = size * 4;
4335 if (nallocated / 4 != size) /* overflow! */
4336 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004337 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004338 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004339 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004340 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004341 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004342
Tim Peters602f7402002-04-27 18:03:26 +00004343 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004344 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004345
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004346 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004347 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004348 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004349
Guido van Rossumd57fd912000-03-10 22:53:23 +00004350 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004351 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004352 *p++ = (char)(0xc0 | (ch >> 6));
4353 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004354 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004355 Py_ssize_t newpos;
4356 PyObject *rep;
4357 Py_ssize_t repsize, k, startpos;
4358 startpos = i-1;
4359#if SIZEOF_WCHAR_T == 2
4360 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004361#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004362 rep = unicode_encode_call_errorhandler(
4363 errors, &errorHandler, "utf-8", "surrogates not allowed",
4364 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4365 &exc, startpos, startpos+1, &newpos);
4366 if (!rep)
4367 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004369 if (PyBytes_Check(rep))
4370 repsize = PyBytes_GET_SIZE(rep);
4371 else
4372 repsize = PyUnicode_GET_SIZE(rep);
4373
4374 if (repsize > 4) {
4375 Py_ssize_t offset;
4376
4377 if (result == NULL)
4378 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004379 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004380 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004382 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4383 /* integer overflow */
4384 PyErr_NoMemory();
4385 goto error;
4386 }
4387 nallocated += repsize - 4;
4388 if (result != NULL) {
4389 if (_PyBytes_Resize(&result, nallocated) < 0)
4390 goto error;
4391 } else {
4392 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004393 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004394 goto error;
4395 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4396 }
4397 p = PyBytes_AS_STRING(result) + offset;
4398 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004399
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004400 if (PyBytes_Check(rep)) {
4401 char *prep = PyBytes_AS_STRING(rep);
4402 for(k = repsize; k > 0; k--)
4403 *p++ = *prep++;
4404 } else /* rep is unicode */ {
4405 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4406 Py_UNICODE c;
4407
4408 for(k=0; k<repsize; k++) {
4409 c = prep[k];
4410 if (0x80 <= c) {
4411 raise_encode_exception(&exc, "utf-8",
4412 PyUnicode_AS_UNICODE(unicode),
4413 size, i-1, i,
4414 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004415 goto error;
4416 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004417 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004418 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004420 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004421 } else if (ch < 0x10000) {
4422 *p++ = (char)(0xe0 | (ch >> 12));
4423 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4424 *p++ = (char)(0x80 | (ch & 0x3f));
4425 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004426 /* Encode UCS4 Unicode ordinals */
4427 *p++ = (char)(0xf0 | (ch >> 18));
4428 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4429 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4430 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004431#if SIZEOF_WCHAR_T == 2
4432 wchar_offset++;
4433#endif
Tim Peters602f7402002-04-27 18:03:26 +00004434 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004436
Guido van Rossum98297ee2007-11-06 21:34:58 +00004437 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004438 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004439 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004440 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004441 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004442 }
4443 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004444 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004445 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004446 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004447 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004448 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004449
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004450 Py_XDECREF(errorHandler);
4451 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004452 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004453 error:
4454 Py_XDECREF(errorHandler);
4455 Py_XDECREF(exc);
4456 Py_XDECREF(result);
4457 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004458
Tim Peters602f7402002-04-27 18:03:26 +00004459#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460}
4461
Alexander Belopolsky40018472011-02-26 01:02:56 +00004462PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004463PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4464 Py_ssize_t size,
4465 const char *errors)
4466{
4467 PyObject *v, *unicode;
4468
4469 unicode = PyUnicode_FromUnicode(s, size);
4470 if (unicode == NULL)
4471 return NULL;
4472 v = _PyUnicode_AsUTF8String(unicode, errors);
4473 Py_DECREF(unicode);
4474 return v;
4475}
4476
4477PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004478PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004480 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481}
4482
Walter Dörwald41980ca2007-08-16 21:55:45 +00004483/* --- UTF-32 Codec ------------------------------------------------------- */
4484
4485PyObject *
4486PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004487 Py_ssize_t size,
4488 const char *errors,
4489 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004490{
4491 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4492}
4493
4494PyObject *
4495PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004496 Py_ssize_t size,
4497 const char *errors,
4498 int *byteorder,
4499 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004500{
4501 const char *starts = s;
4502 Py_ssize_t startinpos;
4503 Py_ssize_t endinpos;
4504 Py_ssize_t outpos;
4505 PyUnicodeObject *unicode;
4506 Py_UNICODE *p;
4507#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004508 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004509 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004510#else
4511 const int pairs = 0;
4512#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004513 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004514 int bo = 0; /* assume native ordering by default */
4515 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004516 /* Offsets from q for retrieving bytes in the right order. */
4517#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4518 int iorder[] = {0, 1, 2, 3};
4519#else
4520 int iorder[] = {3, 2, 1, 0};
4521#endif
4522 PyObject *errorHandler = NULL;
4523 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004524
Walter Dörwald41980ca2007-08-16 21:55:45 +00004525 q = (unsigned char *)s;
4526 e = q + size;
4527
4528 if (byteorder)
4529 bo = *byteorder;
4530
4531 /* Check for BOM marks (U+FEFF) in the input and adjust current
4532 byte order setting accordingly. In native mode, the leading BOM
4533 mark is skipped, in all other modes, it is copied to the output
4534 stream as-is (giving a ZWNBSP character). */
4535 if (bo == 0) {
4536 if (size >= 4) {
4537 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004538 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004539#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004540 if (bom == 0x0000FEFF) {
4541 q += 4;
4542 bo = -1;
4543 }
4544 else if (bom == 0xFFFE0000) {
4545 q += 4;
4546 bo = 1;
4547 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004548#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004549 if (bom == 0x0000FEFF) {
4550 q += 4;
4551 bo = 1;
4552 }
4553 else if (bom == 0xFFFE0000) {
4554 q += 4;
4555 bo = -1;
4556 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004557#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004558 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004559 }
4560
4561 if (bo == -1) {
4562 /* force LE */
4563 iorder[0] = 0;
4564 iorder[1] = 1;
4565 iorder[2] = 2;
4566 iorder[3] = 3;
4567 }
4568 else if (bo == 1) {
4569 /* force BE */
4570 iorder[0] = 3;
4571 iorder[1] = 2;
4572 iorder[2] = 1;
4573 iorder[3] = 0;
4574 }
4575
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004576 /* On narrow builds we split characters outside the BMP into two
4577 codepoints => count how much extra space we need. */
4578#ifndef Py_UNICODE_WIDE
4579 for (qq = q; qq < e; qq += 4)
4580 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4581 pairs++;
4582#endif
4583
4584 /* This might be one to much, because of a BOM */
4585 unicode = _PyUnicode_New((size+3)/4+pairs);
4586 if (!unicode)
4587 return NULL;
4588 if (size == 0)
4589 return (PyObject *)unicode;
4590
4591 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004592 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004593
Walter Dörwald41980ca2007-08-16 21:55:45 +00004594 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004595 Py_UCS4 ch;
4596 /* remaining bytes at the end? (size should be divisible by 4) */
4597 if (e-q<4) {
4598 if (consumed)
4599 break;
4600 errmsg = "truncated data";
4601 startinpos = ((const char *)q)-starts;
4602 endinpos = ((const char *)e)-starts;
4603 goto utf32Error;
4604 /* The remaining input chars are ignored if the callback
4605 chooses to skip the input */
4606 }
4607 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4608 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004609
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 if (ch >= 0x110000)
4611 {
4612 errmsg = "codepoint not in range(0x110000)";
4613 startinpos = ((const char *)q)-starts;
4614 endinpos = startinpos+4;
4615 goto utf32Error;
4616 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004617#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004618 if (ch >= 0x10000)
4619 {
4620 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4621 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4622 }
4623 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004624#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004625 *p++ = ch;
4626 q += 4;
4627 continue;
4628 utf32Error:
4629 outpos = p-PyUnicode_AS_UNICODE(unicode);
4630 if (unicode_decode_call_errorhandler(
4631 errors, &errorHandler,
4632 "utf32", errmsg,
4633 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4634 &unicode, &outpos, &p))
4635 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004636 }
4637
4638 if (byteorder)
4639 *byteorder = bo;
4640
4641 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004642 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004643
4644 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004645 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004646 goto onError;
4647
4648 Py_XDECREF(errorHandler);
4649 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004650 if (PyUnicode_READY(unicode) == -1) {
4651 Py_DECREF(unicode);
4652 return NULL;
4653 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004654 return (PyObject *)unicode;
4655
Benjamin Peterson29060642009-01-31 22:14:21 +00004656 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004657 Py_DECREF(unicode);
4658 Py_XDECREF(errorHandler);
4659 Py_XDECREF(exc);
4660 return NULL;
4661}
4662
4663PyObject *
4664PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004665 Py_ssize_t size,
4666 const char *errors,
4667 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004668{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004669 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004670 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004671 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004672#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004673 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004674#else
4675 const int pairs = 0;
4676#endif
4677 /* Offsets from p for storing byte pairs in the right order. */
4678#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4679 int iorder[] = {0, 1, 2, 3};
4680#else
4681 int iorder[] = {3, 2, 1, 0};
4682#endif
4683
Benjamin Peterson29060642009-01-31 22:14:21 +00004684#define STORECHAR(CH) \
4685 do { \
4686 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4687 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4688 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4689 p[iorder[0]] = (CH) & 0xff; \
4690 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004691 } while(0)
4692
4693 /* In narrow builds we can output surrogate pairs as one codepoint,
4694 so we need less space. */
4695#ifndef Py_UNICODE_WIDE
4696 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004697 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4698 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4699 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004700#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004701 nsize = (size - pairs + (byteorder == 0));
4702 bytesize = nsize * 4;
4703 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004704 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004705 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004706 if (v == NULL)
4707 return NULL;
4708
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004709 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004710 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004711 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004712 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004713 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004714
4715 if (byteorder == -1) {
4716 /* force LE */
4717 iorder[0] = 0;
4718 iorder[1] = 1;
4719 iorder[2] = 2;
4720 iorder[3] = 3;
4721 }
4722 else if (byteorder == 1) {
4723 /* force BE */
4724 iorder[0] = 3;
4725 iorder[1] = 2;
4726 iorder[2] = 1;
4727 iorder[3] = 0;
4728 }
4729
4730 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004731 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004732#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004733 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4734 Py_UCS4 ch2 = *s;
4735 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4736 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4737 s++;
4738 size--;
4739 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004740 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004741#endif
4742 STORECHAR(ch);
4743 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004744
4745 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004746 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004747#undef STORECHAR
4748}
4749
Alexander Belopolsky40018472011-02-26 01:02:56 +00004750PyObject *
4751PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004752{
4753 if (!PyUnicode_Check(unicode)) {
4754 PyErr_BadArgument();
4755 return NULL;
4756 }
4757 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004758 PyUnicode_GET_SIZE(unicode),
4759 NULL,
4760 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004761}
4762
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763/* --- UTF-16 Codec ------------------------------------------------------- */
4764
Tim Peters772747b2001-08-09 22:21:55 +00004765PyObject *
4766PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004767 Py_ssize_t size,
4768 const char *errors,
4769 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770{
Walter Dörwald69652032004-09-07 20:24:22 +00004771 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4772}
4773
Antoine Pitrouab868312009-01-10 15:40:25 +00004774/* Two masks for fast checking of whether a C 'long' may contain
4775 UTF16-encoded surrogate characters. This is an efficient heuristic,
4776 assuming that non-surrogate characters with a code point >= 0x8000 are
4777 rare in most input.
4778 FAST_CHAR_MASK is used when the input is in native byte ordering,
4779 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004780*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004781#if (SIZEOF_LONG == 8)
4782# define FAST_CHAR_MASK 0x8000800080008000L
4783# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4784#elif (SIZEOF_LONG == 4)
4785# define FAST_CHAR_MASK 0x80008000L
4786# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4787#else
4788# error C 'long' size should be either 4 or 8!
4789#endif
4790
Walter Dörwald69652032004-09-07 20:24:22 +00004791PyObject *
4792PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004793 Py_ssize_t size,
4794 const char *errors,
4795 int *byteorder,
4796 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004797{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004798 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004799 Py_ssize_t startinpos;
4800 Py_ssize_t endinpos;
4801 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802 PyUnicodeObject *unicode;
4803 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004804 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004805 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004806 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004807 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004808 /* Offsets from q for retrieving byte pairs in the right order. */
4809#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4810 int ihi = 1, ilo = 0;
4811#else
4812 int ihi = 0, ilo = 1;
4813#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814 PyObject *errorHandler = NULL;
4815 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816
4817 /* Note: size will always be longer than the resulting Unicode
4818 character count */
4819 unicode = _PyUnicode_New(size);
4820 if (!unicode)
4821 return NULL;
4822 if (size == 0)
4823 return (PyObject *)unicode;
4824
4825 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004826 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004827 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004828 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829
4830 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004831 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004833 /* Check for BOM marks (U+FEFF) in the input and adjust current
4834 byte order setting accordingly. In native mode, the leading BOM
4835 mark is skipped, in all other modes, it is copied to the output
4836 stream as-is (giving a ZWNBSP character). */
4837 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004838 if (size >= 2) {
4839 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004840#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004841 if (bom == 0xFEFF) {
4842 q += 2;
4843 bo = -1;
4844 }
4845 else if (bom == 0xFFFE) {
4846 q += 2;
4847 bo = 1;
4848 }
Tim Petersced69f82003-09-16 20:30:58 +00004849#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004850 if (bom == 0xFEFF) {
4851 q += 2;
4852 bo = 1;
4853 }
4854 else if (bom == 0xFFFE) {
4855 q += 2;
4856 bo = -1;
4857 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004858#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004859 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004860 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861
Tim Peters772747b2001-08-09 22:21:55 +00004862 if (bo == -1) {
4863 /* force LE */
4864 ihi = 1;
4865 ilo = 0;
4866 }
4867 else if (bo == 1) {
4868 /* force BE */
4869 ihi = 0;
4870 ilo = 1;
4871 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004872#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4873 native_ordering = ilo < ihi;
4874#else
4875 native_ordering = ilo > ihi;
4876#endif
Tim Peters772747b2001-08-09 22:21:55 +00004877
Antoine Pitrouab868312009-01-10 15:40:25 +00004878 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004879 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004880 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004881 /* First check for possible aligned read of a C 'long'. Unaligned
4882 reads are more expensive, better to defer to another iteration. */
4883 if (!((size_t) q & LONG_PTR_MASK)) {
4884 /* Fast path for runs of non-surrogate chars. */
4885 register const unsigned char *_q = q;
4886 Py_UNICODE *_p = p;
4887 if (native_ordering) {
4888 /* Native ordering is simple: as long as the input cannot
4889 possibly contain a surrogate char, do an unrolled copy
4890 of several 16-bit code points to the target object.
4891 The non-surrogate check is done on several input bytes
4892 at a time (as many as a C 'long' can contain). */
4893 while (_q < aligned_end) {
4894 unsigned long data = * (unsigned long *) _q;
4895 if (data & FAST_CHAR_MASK)
4896 break;
4897 _p[0] = ((unsigned short *) _q)[0];
4898 _p[1] = ((unsigned short *) _q)[1];
4899#if (SIZEOF_LONG == 8)
4900 _p[2] = ((unsigned short *) _q)[2];
4901 _p[3] = ((unsigned short *) _q)[3];
4902#endif
4903 _q += SIZEOF_LONG;
4904 _p += SIZEOF_LONG / 2;
4905 }
4906 }
4907 else {
4908 /* Byteswapped ordering is similar, but we must decompose
4909 the copy bytewise, and take care of zero'ing out the
4910 upper bytes if the target object is in 32-bit units
4911 (that is, in UCS-4 builds). */
4912 while (_q < aligned_end) {
4913 unsigned long data = * (unsigned long *) _q;
4914 if (data & SWAPPED_FAST_CHAR_MASK)
4915 break;
4916 /* Zero upper bytes in UCS-4 builds */
4917#if (Py_UNICODE_SIZE > 2)
4918 _p[0] = 0;
4919 _p[1] = 0;
4920#if (SIZEOF_LONG == 8)
4921 _p[2] = 0;
4922 _p[3] = 0;
4923#endif
4924#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004925 /* Issue #4916; UCS-4 builds on big endian machines must
4926 fill the two last bytes of each 4-byte unit. */
4927#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4928# define OFF 2
4929#else
4930# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004931#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004932 ((unsigned char *) _p)[OFF + 1] = _q[0];
4933 ((unsigned char *) _p)[OFF + 0] = _q[1];
4934 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4935 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4936#if (SIZEOF_LONG == 8)
4937 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4938 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4939 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4940 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4941#endif
4942#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004943 _q += SIZEOF_LONG;
4944 _p += SIZEOF_LONG / 2;
4945 }
4946 }
4947 p = _p;
4948 q = _q;
4949 if (q >= e)
4950 break;
4951 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004952 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004953
Benjamin Peterson14339b62009-01-31 16:36:08 +00004954 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004955
4956 if (ch < 0xD800 || ch > 0xDFFF) {
4957 *p++ = ch;
4958 continue;
4959 }
4960
4961 /* UTF-16 code pair: */
4962 if (q > e) {
4963 errmsg = "unexpected end of data";
4964 startinpos = (((const char *)q) - 2) - starts;
4965 endinpos = ((const char *)e) + 1 - starts;
4966 goto utf16Error;
4967 }
4968 if (0xD800 <= ch && ch <= 0xDBFF) {
4969 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4970 q += 2;
4971 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004972#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004973 *p++ = ch;
4974 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004975#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004976 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004977#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004978 continue;
4979 }
4980 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004981 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004982 startinpos = (((const char *)q)-4)-starts;
4983 endinpos = startinpos+2;
4984 goto utf16Error;
4985 }
4986
Benjamin Peterson14339b62009-01-31 16:36:08 +00004987 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004988 errmsg = "illegal encoding";
4989 startinpos = (((const char *)q)-2)-starts;
4990 endinpos = startinpos+2;
4991 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004992
Benjamin Peterson29060642009-01-31 22:14:21 +00004993 utf16Error:
4994 outpos = p - PyUnicode_AS_UNICODE(unicode);
4995 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004996 errors,
4997 &errorHandler,
4998 "utf16", errmsg,
4999 &starts,
5000 (const char **)&e,
5001 &startinpos,
5002 &endinpos,
5003 &exc,
5004 (const char **)&q,
5005 &unicode,
5006 &outpos,
5007 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005008 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005009 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005010 /* remaining byte at the end? (size should be even) */
5011 if (e == q) {
5012 if (!consumed) {
5013 errmsg = "truncated data";
5014 startinpos = ((const char *)q) - starts;
5015 endinpos = ((const char *)e) + 1 - starts;
5016 outpos = p - PyUnicode_AS_UNICODE(unicode);
5017 if (unicode_decode_call_errorhandler(
5018 errors,
5019 &errorHandler,
5020 "utf16", errmsg,
5021 &starts,
5022 (const char **)&e,
5023 &startinpos,
5024 &endinpos,
5025 &exc,
5026 (const char **)&q,
5027 &unicode,
5028 &outpos,
5029 &p))
5030 goto onError;
5031 /* The remaining input chars are ignored if the callback
5032 chooses to skip the input */
5033 }
5034 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035
5036 if (byteorder)
5037 *byteorder = bo;
5038
Walter Dörwald69652032004-09-07 20:24:22 +00005039 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005040 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005041
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005043 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005044 goto onError;
5045
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005046 Py_XDECREF(errorHandler);
5047 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005048 if (PyUnicode_READY(unicode) == -1) {
5049 Py_DECREF(unicode);
5050 return NULL;
5051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005052 return (PyObject *)unicode;
5053
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005056 Py_XDECREF(errorHandler);
5057 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058 return NULL;
5059}
5060
Antoine Pitrouab868312009-01-10 15:40:25 +00005061#undef FAST_CHAR_MASK
5062#undef SWAPPED_FAST_CHAR_MASK
5063
Tim Peters772747b2001-08-09 22:21:55 +00005064PyObject *
5065PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 Py_ssize_t size,
5067 const char *errors,
5068 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005069{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005070 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005071 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005072 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005073#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005074 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005075#else
5076 const int pairs = 0;
5077#endif
Tim Peters772747b2001-08-09 22:21:55 +00005078 /* Offsets from p for storing byte pairs in the right order. */
5079#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5080 int ihi = 1, ilo = 0;
5081#else
5082 int ihi = 0, ilo = 1;
5083#endif
5084
Benjamin Peterson29060642009-01-31 22:14:21 +00005085#define STORECHAR(CH) \
5086 do { \
5087 p[ihi] = ((CH) >> 8) & 0xff; \
5088 p[ilo] = (CH) & 0xff; \
5089 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005090 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005092#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005093 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005094 if (s[i] >= 0x10000)
5095 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005096#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005097 /* 2 * (size + pairs + (byteorder == 0)) */
5098 if (size > PY_SSIZE_T_MAX ||
5099 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005100 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005101 nsize = size + pairs + (byteorder == 0);
5102 bytesize = nsize * 2;
5103 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005104 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005105 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106 if (v == NULL)
5107 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005109 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005111 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005112 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005113 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005114
5115 if (byteorder == -1) {
5116 /* force LE */
5117 ihi = 1;
5118 ilo = 0;
5119 }
5120 else if (byteorder == 1) {
5121 /* force BE */
5122 ihi = 0;
5123 ilo = 1;
5124 }
5125
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005126 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005127 Py_UNICODE ch = *s++;
5128 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005129#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005130 if (ch >= 0x10000) {
5131 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5132 ch = 0xD800 | ((ch-0x10000) >> 10);
5133 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005134#endif
Tim Peters772747b2001-08-09 22:21:55 +00005135 STORECHAR(ch);
5136 if (ch2)
5137 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005138 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005139
5140 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005141 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005142#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143}
5144
Alexander Belopolsky40018472011-02-26 01:02:56 +00005145PyObject *
5146PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147{
5148 if (!PyUnicode_Check(unicode)) {
5149 PyErr_BadArgument();
5150 return NULL;
5151 }
5152 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005153 PyUnicode_GET_SIZE(unicode),
5154 NULL,
5155 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156}
5157
5158/* --- Unicode Escape Codec ----------------------------------------------- */
5159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005160/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5161 if all the escapes in the string make it still a valid ASCII string.
5162 Returns -1 if any escapes were found which cause the string to
5163 pop out of ASCII range. Otherwise returns the length of the
5164 required buffer to hold the string.
5165 */
5166Py_ssize_t
5167length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5168{
5169 const unsigned char *p = (const unsigned char *)s;
5170 const unsigned char *end = p + size;
5171 Py_ssize_t length = 0;
5172
5173 if (size < 0)
5174 return -1;
5175
5176 for (; p < end; ++p) {
5177 if (*p > 127) {
5178 /* Non-ASCII */
5179 return -1;
5180 }
5181 else if (*p != '\\') {
5182 /* Normal character */
5183 ++length;
5184 }
5185 else {
5186 /* Backslash-escape, check next char */
5187 ++p;
5188 /* Escape sequence reaches till end of string or
5189 non-ASCII follow-up. */
5190 if (p >= end || *p > 127)
5191 return -1;
5192 switch (*p) {
5193 case '\n':
5194 /* backslash + \n result in zero characters */
5195 break;
5196 case '\\': case '\'': case '\"':
5197 case 'b': case 'f': case 't':
5198 case 'n': case 'r': case 'v': case 'a':
5199 ++length;
5200 break;
5201 case '0': case '1': case '2': case '3':
5202 case '4': case '5': case '6': case '7':
5203 case 'x': case 'u': case 'U': case 'N':
5204 /* these do not guarantee ASCII characters */
5205 return -1;
5206 default:
5207 /* count the backslash + the other character */
5208 length += 2;
5209 }
5210 }
5211 }
5212 return length;
5213}
5214
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   When `kind` is PyUnicode_WCHAR_KIND, `buf` is indexed as an array of
   Py_UNICODE; for any other kind it is indexed as an array of unsigned
   char and `value` is narrowed to one byte. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store `value` at `index` of the Py_UNICODE array `buf`.

   NOTE: the assertion reads a variable named `kind` that must be in scope
   at the point of use; it is not a macro parameter.  The whole expansion
   is parenthesized so that the comma expression cannot be split apart by
   operators or argument lists surrounding the macro invocation. */
#define WRITE_WSTR(buf, index, value) \
    (assert(kind == PyUnicode_WCHAR_KIND), \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
5228
5229
Fredrik Lundh06d12682001-01-24 07:59:11 +00005230static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005231
Alexander Belopolsky40018472011-02-26 01:02:56 +00005232PyObject *
5233PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005234 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005235 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005237 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005238 Py_ssize_t startinpos;
5239 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005240 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005242 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005244 char* message;
5245 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005246 PyObject *errorHandler = NULL;
5247 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005248 Py_ssize_t ascii_length;
5249 Py_ssize_t i;
5250 int kind;
5251 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005253 ascii_length = length_of_escaped_ascii_string(s, size);
5254
5255 /* After length_of_escaped_ascii_string() there are two alternatives,
5256 either the string is pure ASCII with named escapes like \n, etc.
5257 and we determined it's exact size (common case)
5258 or it contains \x, \u, ... escape sequences. then we create a
5259 legacy wchar string and resize it at the end of this function. */
5260 if (ascii_length >= 0) {
5261 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5262 if (!v)
5263 goto onError;
5264 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5265 kind = PyUnicode_1BYTE_KIND;
5266 data = PyUnicode_DATA(v);
5267 }
5268 else {
5269 /* Escaped strings will always be longer than the resulting
5270 Unicode string, so we start with size here and then reduce the
5271 length after conversion to the true value.
5272 (but if the error callback returns a long replacement string
5273 we'll have to allocate more space) */
5274 v = _PyUnicode_New(size);
5275 if (!v)
5276 goto onError;
5277 kind = PyUnicode_WCHAR_KIND;
5278 data = PyUnicode_AS_UNICODE(v);
5279 }
5280
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281 if (size == 0)
5282 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005283 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005285
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286 while (s < end) {
5287 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005288 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005289 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005291 if (kind == PyUnicode_WCHAR_KIND) {
5292 assert(i < _PyUnicode_WSTR_LENGTH(v));
5293 }
5294 else {
5295 /* The only case in which i == ascii_length is a backslash
5296 followed by a newline. */
5297 assert(i <= ascii_length);
5298 }
5299
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300 /* Non-escape characters are interpreted as Unicode ordinals */
5301 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005302 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303 continue;
5304 }
5305
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005306 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 /* \ - Escapes */
5308 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005309 c = *s++;
5310 if (s > end)
5311 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005312
5313 if (kind == PyUnicode_WCHAR_KIND) {
5314 assert(i < _PyUnicode_WSTR_LENGTH(v));
5315 }
5316 else {
5317 /* The only case in which i == ascii_length is a backslash
5318 followed by a newline. */
5319 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5320 }
5321
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005322 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323
Benjamin Peterson29060642009-01-31 22:14:21 +00005324 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005326 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5327 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5328 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5329 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5330 /* FF */
5331 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5332 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5333 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5334 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5335 /* VT */
5336 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5337 /* BEL, not classic C */
5338 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339
Benjamin Peterson29060642009-01-31 22:14:21 +00005340 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341 case '0': case '1': case '2': case '3':
5342 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005343 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005344 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005345 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005346 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005347 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005349 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350 break;
5351
Benjamin Peterson29060642009-01-31 22:14:21 +00005352 /* hex escapes */
5353 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005355 digits = 2;
5356 message = "truncated \\xXX escape";
5357 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005361 digits = 4;
5362 message = "truncated \\uXXXX escape";
5363 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005366 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005367 digits = 8;
5368 message = "truncated \\UXXXXXXXX escape";
5369 hexescape:
5370 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005371 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005372 if (s+digits>end) {
5373 endinpos = size;
5374 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 errors, &errorHandler,
5376 "unicodeescape", "end of string in escape sequence",
5377 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005378 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005379 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005380 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005381 goto nextByte;
5382 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005383 for (j = 0; j < digits; ++j) {
5384 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005385 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005386 endinpos = (s+j+1)-starts;
5387 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005388 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005389 errors, &errorHandler,
5390 "unicodeescape", message,
5391 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005392 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005393 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005394 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005395 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005396 }
5397 chr = (chr<<4) & ~0xF;
5398 if (c >= '0' && c <= '9')
5399 chr += c - '0';
5400 else if (c >= 'a' && c <= 'f')
5401 chr += 10 + c - 'a';
5402 else
5403 chr += 10 + c - 'A';
5404 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005405 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005406 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005407 /* _decoding_error will have already written into the
5408 target buffer. */
5409 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005410 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005411 /* when we get here, chr is a 32-bit unicode character */
5412 if (chr <= 0xffff)
5413 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005414 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005415 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005416 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005417 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005418#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005419 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005420#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005421 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005422 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5423 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005424#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005425 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005426 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005427 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005428 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005429 errors, &errorHandler,
5430 "unicodeescape", "illegal Unicode character",
5431 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005432 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005433 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005434 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005435 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005436 break;
5437
Benjamin Peterson29060642009-01-31 22:14:21 +00005438 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005439 case 'N':
5440 message = "malformed \\N character escape";
5441 if (ucnhash_CAPI == NULL) {
5442 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005443 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5444 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005445 if (ucnhash_CAPI == NULL)
5446 goto ucnhashError;
5447 }
5448 if (*s == '{') {
5449 const char *start = s+1;
5450 /* look for the closing brace */
5451 while (*s != '}' && s < end)
5452 s++;
5453 if (s > start && s < end && *s == '}') {
5454 /* found a name. look it up in the unicode database */
5455 message = "unknown Unicode character name";
5456 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005457 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5458 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005459 goto store;
5460 }
5461 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005462 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005463 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005464 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005465 errors, &errorHandler,
5466 "unicodeescape", message,
5467 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005468 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005469 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005470 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005471 break;
5472
5473 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005474 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005475 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005476 message = "\\ at end of string";
5477 s--;
5478 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005479 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005480 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005481 errors, &errorHandler,
5482 "unicodeescape", message,
5483 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005484 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005485 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005486 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005487 }
5488 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005489 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5490 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005491 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005492 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005494 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005495 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005497 /* Ensure the length prediction worked in case of ASCII strings */
5498 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5499
Victor Stinnerfe226c02011-10-03 03:52:20 +02005500 if (kind == PyUnicode_WCHAR_KIND)
5501 {
5502 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5503 goto onError;
5504 if (PyUnicode_READY(v) == -1)
5505 goto onError;
5506 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005507 Py_XDECREF(errorHandler);
5508 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005510
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005512 PyErr_SetString(
5513 PyExc_UnicodeError,
5514 "\\N escapes not supported (can't load unicodedata module)"
5515 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005516 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005517 Py_XDECREF(errorHandler);
5518 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005519 return NULL;
5520
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005523 Py_XDECREF(errorHandler);
5524 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 return NULL;
5526}
5527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005528#undef WRITE_ASCII_OR_WSTR
5529#undef WRITE_WSTR
5530
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531/* Return a Unicode-Escape string version of the Unicode object.
5532
5533 If quotes is true, the string is enclosed in u"" or u'' quotes as
5534 appropriate.
5535
5536*/
5537
Walter Dörwald79e913e2007-05-12 11:08:06 +00005538static const char *hexdigits = "0123456789abcdef";
5539
Alexander Belopolsky40018472011-02-26 01:02:56 +00005540PyObject *
5541PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005542 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005543{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005544 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005547#ifdef Py_UNICODE_WIDE
5548 const Py_ssize_t expandsize = 10;
5549#else
5550 const Py_ssize_t expandsize = 6;
5551#endif
5552
Thomas Wouters89f507f2006-12-13 04:49:30 +00005553 /* XXX(nnorwitz): rather than over-allocating, it would be
5554 better to choose a different scheme. Perhaps scan the
5555 first N-chars of the string and allocate based on that size.
5556 */
5557 /* Initial allocation is based on the longest-possible unichr
5558 escape.
5559
5560 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5561 unichr, so in this case it's the longest unichr escape. In
5562 narrow (UTF-16) builds this is five chars per source unichr
5563 since there are two unichrs in the surrogate pair, so in narrow
5564 (UTF-16) builds it's not the longest unichr escape.
5565
5566 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5567 so in the narrow (UTF-16) build case it's the longest unichr
5568 escape.
5569 */
5570
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005571 if (size == 0)
5572 return PyBytes_FromStringAndSize(NULL, 0);
5573
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005574 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005575 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005576
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005577 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005578 2
5579 + expandsize*size
5580 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581 if (repr == NULL)
5582 return NULL;
5583
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005584 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586 while (size-- > 0) {
5587 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005588
Walter Dörwald79e913e2007-05-12 11:08:06 +00005589 /* Escape backslashes */
5590 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591 *p++ = '\\';
5592 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005593 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005594 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005595
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005596#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005597 /* Map 21-bit characters to '\U00xxxxxx' */
5598 else if (ch >= 0x10000) {
5599 *p++ = '\\';
5600 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005601 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5602 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5603 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5604 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5605 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5606 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5607 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5608 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005609 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005610 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005611#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005612 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5613 else if (ch >= 0xD800 && ch < 0xDC00) {
5614 Py_UNICODE ch2;
5615 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005616
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 ch2 = *s++;
5618 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005619 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005620 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5621 *p++ = '\\';
5622 *p++ = 'U';
5623 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5624 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5625 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5626 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5627 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5628 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5629 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5630 *p++ = hexdigits[ucs & 0x0000000F];
5631 continue;
5632 }
5633 /* Fall through: isolated surrogates are copied as-is */
5634 s--;
5635 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005636 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005637#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005638
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005640 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 *p++ = '\\';
5642 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005643 *p++ = hexdigits[(ch >> 12) & 0x000F];
5644 *p++ = hexdigits[(ch >> 8) & 0x000F];
5645 *p++ = hexdigits[(ch >> 4) & 0x000F];
5646 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005648
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005649 /* Map special whitespace to '\t', \n', '\r' */
5650 else if (ch == '\t') {
5651 *p++ = '\\';
5652 *p++ = 't';
5653 }
5654 else if (ch == '\n') {
5655 *p++ = '\\';
5656 *p++ = 'n';
5657 }
5658 else if (ch == '\r') {
5659 *p++ = '\\';
5660 *p++ = 'r';
5661 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005662
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005663 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005664 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005666 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005667 *p++ = hexdigits[(ch >> 4) & 0x000F];
5668 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005669 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005670
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671 /* Copy everything else as-is */
5672 else
5673 *p++ = (char) ch;
5674 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005676 assert(p - PyBytes_AS_STRING(repr) > 0);
5677 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5678 return NULL;
5679 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680}
5681
Alexander Belopolsky40018472011-02-26 01:02:56 +00005682PyObject *
5683PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005685 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686 if (!PyUnicode_Check(unicode)) {
5687 PyErr_BadArgument();
5688 return NULL;
5689 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005690 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5691 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005692 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693}
5694
5695/* --- Raw Unicode Escape Codec ------------------------------------------- */
5696
Alexander Belopolsky40018472011-02-26 01:02:56 +00005697PyObject *
5698PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005699 Py_ssize_t size,
5700 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005702 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005703 Py_ssize_t startinpos;
5704 Py_ssize_t endinpos;
5705 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005707 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 const char *end;
5709 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005710 PyObject *errorHandler = NULL;
5711 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005712
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 /* Escaped strings will always be longer than the resulting
5714 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715 length after conversion to the true value. (But decoding error
5716 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717 v = _PyUnicode_New(size);
5718 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005719 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005721 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005722 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723 end = s + size;
5724 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005725 unsigned char c;
5726 Py_UCS4 x;
5727 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005728 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729
Benjamin Peterson29060642009-01-31 22:14:21 +00005730 /* Non-escape characters are interpreted as Unicode ordinals */
5731 if (*s != '\\') {
5732 *p++ = (unsigned char)*s++;
5733 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005734 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005735 startinpos = s-starts;
5736
5737 /* \u-escapes are only interpreted iff the number of leading
5738 backslashes if odd */
5739 bs = s;
5740 for (;s < end;) {
5741 if (*s != '\\')
5742 break;
5743 *p++ = (unsigned char)*s++;
5744 }
5745 if (((s - bs) & 1) == 0 ||
5746 s >= end ||
5747 (*s != 'u' && *s != 'U')) {
5748 continue;
5749 }
5750 p--;
5751 count = *s=='u' ? 4 : 8;
5752 s++;
5753
5754 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5755 outpos = p-PyUnicode_AS_UNICODE(v);
5756 for (x = 0, i = 0; i < count; ++i, ++s) {
5757 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005758 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005759 endinpos = s-starts;
5760 if (unicode_decode_call_errorhandler(
5761 errors, &errorHandler,
5762 "rawunicodeescape", "truncated \\uXXXX",
5763 &starts, &end, &startinpos, &endinpos, &exc, &s,
5764 &v, &outpos, &p))
5765 goto onError;
5766 goto nextByte;
5767 }
5768 x = (x<<4) & ~0xF;
5769 if (c >= '0' && c <= '9')
5770 x += c - '0';
5771 else if (c >= 'a' && c <= 'f')
5772 x += 10 + c - 'a';
5773 else
5774 x += 10 + c - 'A';
5775 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005776 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005777 /* UCS-2 character */
5778 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005779 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 /* UCS-4 character. Either store directly, or as
5781 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005782#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005784#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005785 x -= 0x10000L;
5786 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5787 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005788#endif
5789 } else {
5790 endinpos = s-starts;
5791 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005792 if (unicode_decode_call_errorhandler(
5793 errors, &errorHandler,
5794 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005795 &starts, &end, &startinpos, &endinpos, &exc, &s,
5796 &v, &outpos, &p))
5797 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005798 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005799 nextByte:
5800 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005802 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005804 Py_XDECREF(errorHandler);
5805 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005806 if (PyUnicode_READY(v) == -1) {
5807 Py_DECREF(v);
5808 return NULL;
5809 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005811
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005814 Py_XDECREF(errorHandler);
5815 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005816 return NULL;
5817}
5818
Alexander Belopolsky40018472011-02-26 01:02:56 +00005819PyObject *
5820PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005821 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005823 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824 char *p;
5825 char *q;
5826
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005827#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005828 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005829#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005830 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005831#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005832
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005833 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005835
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005836 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837 if (repr == NULL)
5838 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005839 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005840 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005842 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 while (size-- > 0) {
5844 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005845#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005846 /* Map 32-bit characters to '\Uxxxxxxxx' */
5847 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005848 *p++ = '\\';
5849 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005850 *p++ = hexdigits[(ch >> 28) & 0xf];
5851 *p++ = hexdigits[(ch >> 24) & 0xf];
5852 *p++ = hexdigits[(ch >> 20) & 0xf];
5853 *p++ = hexdigits[(ch >> 16) & 0xf];
5854 *p++ = hexdigits[(ch >> 12) & 0xf];
5855 *p++ = hexdigits[(ch >> 8) & 0xf];
5856 *p++ = hexdigits[(ch >> 4) & 0xf];
5857 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005858 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005859 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005860#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5862 if (ch >= 0xD800 && ch < 0xDC00) {
5863 Py_UNICODE ch2;
5864 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005865
Benjamin Peterson29060642009-01-31 22:14:21 +00005866 ch2 = *s++;
5867 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005868 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005869 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5870 *p++ = '\\';
5871 *p++ = 'U';
5872 *p++ = hexdigits[(ucs >> 28) & 0xf];
5873 *p++ = hexdigits[(ucs >> 24) & 0xf];
5874 *p++ = hexdigits[(ucs >> 20) & 0xf];
5875 *p++ = hexdigits[(ucs >> 16) & 0xf];
5876 *p++ = hexdigits[(ucs >> 12) & 0xf];
5877 *p++ = hexdigits[(ucs >> 8) & 0xf];
5878 *p++ = hexdigits[(ucs >> 4) & 0xf];
5879 *p++ = hexdigits[ucs & 0xf];
5880 continue;
5881 }
5882 /* Fall through: isolated surrogates are copied as-is */
5883 s--;
5884 size++;
5885 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005886#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005887 /* Map 16-bit characters to '\uxxxx' */
5888 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889 *p++ = '\\';
5890 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005891 *p++ = hexdigits[(ch >> 12) & 0xf];
5892 *p++ = hexdigits[(ch >> 8) & 0xf];
5893 *p++ = hexdigits[(ch >> 4) & 0xf];
5894 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005896 /* Copy everything else as-is */
5897 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 *p++ = (char) ch;
5899 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005900 size = p - q;
5901
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005902 assert(size > 0);
5903 if (_PyBytes_Resize(&repr, size) < 0)
5904 return NULL;
5905 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906}
5907
Alexander Belopolsky40018472011-02-26 01:02:56 +00005908PyObject *
5909PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005911 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005913 PyErr_BadArgument();
5914 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005916 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5917 PyUnicode_GET_SIZE(unicode));
5918
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005919 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920}
5921
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005922/* --- Unicode Internal Codec ------------------------------------------- */
5923
Alexander Belopolsky40018472011-02-26 01:02:56 +00005924PyObject *
5925_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005926 Py_ssize_t size,
5927 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005928{
5929 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005930 Py_ssize_t startinpos;
5931 Py_ssize_t endinpos;
5932 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005933 PyUnicodeObject *v;
5934 Py_UNICODE *p;
5935 const char *end;
5936 const char *reason;
5937 PyObject *errorHandler = NULL;
5938 PyObject *exc = NULL;
5939
Neal Norwitzd43069c2006-01-08 01:12:10 +00005940#ifdef Py_UNICODE_WIDE
5941 Py_UNICODE unimax = PyUnicode_GetMax();
5942#endif
5943
Thomas Wouters89f507f2006-12-13 04:49:30 +00005944 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005945 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5946 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005947 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005948 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5949 as string was created with the old API. */
5950 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005952 p = PyUnicode_AS_UNICODE(v);
5953 end = s + size;
5954
5955 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005956 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005957 /* We have to sanity check the raw data, otherwise doom looms for
5958 some malformed UCS-4 data. */
5959 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005960#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005961 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005962#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005963 end-s < Py_UNICODE_SIZE
5964 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005965 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005966 startinpos = s - starts;
5967 if (end-s < Py_UNICODE_SIZE) {
5968 endinpos = end-starts;
5969 reason = "truncated input";
5970 }
5971 else {
5972 endinpos = s - starts + Py_UNICODE_SIZE;
5973 reason = "illegal code point (> 0x10FFFF)";
5974 }
5975 outpos = p - PyUnicode_AS_UNICODE(v);
5976 if (unicode_decode_call_errorhandler(
5977 errors, &errorHandler,
5978 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005979 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005980 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005981 goto onError;
5982 }
5983 }
5984 else {
5985 p++;
5986 s += Py_UNICODE_SIZE;
5987 }
5988 }
5989
Victor Stinnerfe226c02011-10-03 03:52:20 +02005990 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005991 goto onError;
5992 Py_XDECREF(errorHandler);
5993 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005994 if (PyUnicode_READY(v) == -1) {
5995 Py_DECREF(v);
5996 return NULL;
5997 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005998 return (PyObject *)v;
5999
Benjamin Peterson29060642009-01-31 22:14:21 +00006000 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006001 Py_XDECREF(v);
6002 Py_XDECREF(errorHandler);
6003 Py_XDECREF(exc);
6004 return NULL;
6005}
6006
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007/* --- Latin-1 Codec ------------------------------------------------------ */
6008
Alexander Belopolsky40018472011-02-26 01:02:56 +00006009PyObject *
6010PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006011 Py_ssize_t size,
6012 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006015 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016}
6017
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006018/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006019static void
6020make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006021 const char *encoding,
6022 const Py_UNICODE *unicode, Py_ssize_t size,
6023 Py_ssize_t startpos, Py_ssize_t endpos,
6024 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006026 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 *exceptionObject = PyUnicodeEncodeError_Create(
6028 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 }
6030 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6032 goto onError;
6033 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6034 goto onError;
6035 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6036 goto onError;
6037 return;
6038 onError:
6039 Py_DECREF(*exceptionObject);
6040 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 }
6042}
6043
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006044/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006045static void
6046raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006047 const char *encoding,
6048 const Py_UNICODE *unicode, Py_ssize_t size,
6049 Py_ssize_t startpos, Py_ssize_t endpos,
6050 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006051{
6052 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006053 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006054 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006055 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006056}
6057
6058/* error handling callback helper:
6059 build arguments, call the callback and check the arguments,
6060 put the result into newpos and return the replacement string, which
6061 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006062static PyObject *
6063unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006064 PyObject **errorHandler,
6065 const char *encoding, const char *reason,
6066 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6067 Py_ssize_t startpos, Py_ssize_t endpos,
6068 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006069{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006070 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006071
6072 PyObject *restuple;
6073 PyObject *resunicode;
6074
6075 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006077 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006079 }
6080
6081 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006083 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006085
6086 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006087 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006088 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006089 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006090 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006091 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006092 Py_DECREF(restuple);
6093 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006094 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006095 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006096 &resunicode, newpos)) {
6097 Py_DECREF(restuple);
6098 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006099 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006100 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6101 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6102 Py_DECREF(restuple);
6103 return NULL;
6104 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006105 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006106 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006107 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006108 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6109 Py_DECREF(restuple);
6110 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006111 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006112 Py_INCREF(resunicode);
6113 Py_DECREF(restuple);
6114 return resunicode;
6115}
6116
Alexander Belopolsky40018472011-02-26 01:02:56 +00006117static PyObject *
6118unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006119 Py_ssize_t size,
6120 const char *errors,
6121 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006122{
6123 /* output object */
6124 PyObject *res;
6125 /* pointers to the beginning and end+1 of input */
6126 const Py_UNICODE *startp = p;
6127 const Py_UNICODE *endp = p + size;
6128 /* pointer to the beginning of the unencodable characters */
6129 /* const Py_UNICODE *badp = NULL; */
6130 /* pointer into the output */
6131 char *str;
6132 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006133 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006134 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6135 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006136 PyObject *errorHandler = NULL;
6137 PyObject *exc = NULL;
6138 /* the following variable is used for caching string comparisons
6139 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6140 int known_errorHandler = -1;
6141
6142 /* allocate enough for a simple encoding without
6143 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006144 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006145 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006146 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006147 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006148 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006149 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006150 ressize = size;
6151
6152 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006153 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006154
Benjamin Peterson29060642009-01-31 22:14:21 +00006155 /* can we encode this? */
6156 if (c<limit) {
6157 /* no overflow check, because we know that the space is enough */
6158 *str++ = (char)c;
6159 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006160 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006161 else {
6162 Py_ssize_t unicodepos = p-startp;
6163 Py_ssize_t requiredsize;
6164 PyObject *repunicode;
6165 Py_ssize_t repsize;
6166 Py_ssize_t newpos;
6167 Py_ssize_t respos;
6168 Py_UNICODE *uni2;
6169 /* startpos for collecting unencodable chars */
6170 const Py_UNICODE *collstart = p;
6171 const Py_UNICODE *collend = p;
6172 /* find all unecodable characters */
6173 while ((collend < endp) && ((*collend)>=limit))
6174 ++collend;
6175 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6176 if (known_errorHandler==-1) {
6177 if ((errors==NULL) || (!strcmp(errors, "strict")))
6178 known_errorHandler = 1;
6179 else if (!strcmp(errors, "replace"))
6180 known_errorHandler = 2;
6181 else if (!strcmp(errors, "ignore"))
6182 known_errorHandler = 3;
6183 else if (!strcmp(errors, "xmlcharrefreplace"))
6184 known_errorHandler = 4;
6185 else
6186 known_errorHandler = 0;
6187 }
6188 switch (known_errorHandler) {
6189 case 1: /* strict */
6190 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6191 goto onError;
6192 case 2: /* replace */
6193 while (collstart++<collend)
6194 *str++ = '?'; /* fall through */
6195 case 3: /* ignore */
6196 p = collend;
6197 break;
6198 case 4: /* xmlcharrefreplace */
6199 respos = str - PyBytes_AS_STRING(res);
6200 /* determine replacement size (temporarily (mis)uses p) */
6201 for (p = collstart, repsize = 0; p < collend; ++p) {
6202 if (*p<10)
6203 repsize += 2+1+1;
6204 else if (*p<100)
6205 repsize += 2+2+1;
6206 else if (*p<1000)
6207 repsize += 2+3+1;
6208 else if (*p<10000)
6209 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006210#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006211 else
6212 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006213#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 else if (*p<100000)
6215 repsize += 2+5+1;
6216 else if (*p<1000000)
6217 repsize += 2+6+1;
6218 else
6219 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006220#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 }
6222 requiredsize = respos+repsize+(endp-collend);
6223 if (requiredsize > ressize) {
6224 if (requiredsize<2*ressize)
6225 requiredsize = 2*ressize;
6226 if (_PyBytes_Resize(&res, requiredsize))
6227 goto onError;
6228 str = PyBytes_AS_STRING(res) + respos;
6229 ressize = requiredsize;
6230 }
6231 /* generate replacement (temporarily (mis)uses p) */
6232 for (p = collstart; p < collend; ++p) {
6233 str += sprintf(str, "&#%d;", (int)*p);
6234 }
6235 p = collend;
6236 break;
6237 default:
6238 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6239 encoding, reason, startp, size, &exc,
6240 collstart-startp, collend-startp, &newpos);
6241 if (repunicode == NULL)
6242 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006243 if (PyBytes_Check(repunicode)) {
6244 /* Directly copy bytes result to output. */
6245 repsize = PyBytes_Size(repunicode);
6246 if (repsize > 1) {
6247 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006248 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006249 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6250 Py_DECREF(repunicode);
6251 goto onError;
6252 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006253 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006254 ressize += repsize-1;
6255 }
6256 memcpy(str, PyBytes_AsString(repunicode), repsize);
6257 str += repsize;
6258 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006259 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006260 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006261 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 /* need more space? (at least enough for what we
6263 have+the replacement+the rest of the string, so
6264 we won't have to check space for encodable characters) */
6265 respos = str - PyBytes_AS_STRING(res);
6266 repsize = PyUnicode_GET_SIZE(repunicode);
6267 requiredsize = respos+repsize+(endp-collend);
6268 if (requiredsize > ressize) {
6269 if (requiredsize<2*ressize)
6270 requiredsize = 2*ressize;
6271 if (_PyBytes_Resize(&res, requiredsize)) {
6272 Py_DECREF(repunicode);
6273 goto onError;
6274 }
6275 str = PyBytes_AS_STRING(res) + respos;
6276 ressize = requiredsize;
6277 }
6278 /* check if there is anything unencodable in the replacement
6279 and copy it to the output */
6280 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6281 c = *uni2;
6282 if (c >= limit) {
6283 raise_encode_exception(&exc, encoding, startp, size,
6284 unicodepos, unicodepos+1, reason);
6285 Py_DECREF(repunicode);
6286 goto onError;
6287 }
6288 *str = (char)c;
6289 }
6290 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006291 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006292 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006293 }
6294 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006295 /* Resize if we allocated to much */
6296 size = str - PyBytes_AS_STRING(res);
6297 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006298 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006299 if (_PyBytes_Resize(&res, size) < 0)
6300 goto onError;
6301 }
6302
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006303 Py_XDECREF(errorHandler);
6304 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006305 return res;
6306
6307 onError:
6308 Py_XDECREF(res);
6309 Py_XDECREF(errorHandler);
6310 Py_XDECREF(exc);
6311 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006312}
6313
Alexander Belopolsky40018472011-02-26 01:02:56 +00006314PyObject *
6315PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006316 Py_ssize_t size,
6317 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006319 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320}
6321
Alexander Belopolsky40018472011-02-26 01:02:56 +00006322PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006323_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324{
6325 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 PyErr_BadArgument();
6327 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006329 if (PyUnicode_READY(unicode) == -1)
6330 return NULL;
6331 /* Fast path: if it is a one-byte string, construct
6332 bytes object directly. */
6333 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6334 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6335 PyUnicode_GET_LENGTH(unicode));
6336 /* Non-Latin-1 characters present. Defer to above function to
6337 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006339 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006340 errors);
6341}
6342
6343PyObject*
6344PyUnicode_AsLatin1String(PyObject *unicode)
6345{
6346 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347}
6348
6349/* --- 7-bit ASCII Codec -------------------------------------------------- */
6350
Alexander Belopolsky40018472011-02-26 01:02:56 +00006351PyObject *
6352PyUnicode_DecodeASCII(const char *s,
6353 Py_ssize_t size,
6354 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006356 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006357 PyUnicodeObject *v;
6358 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006359 Py_ssize_t startinpos;
6360 Py_ssize_t endinpos;
6361 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006362 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006363 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006364 PyObject *errorHandler = NULL;
6365 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006366 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006367
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006369 if (size == 1 && *(unsigned char*)s < 128)
6370 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6371
6372 /* Fast path. Assume the input actually *is* ASCII, and allocate
6373 a single-block Unicode object with that assumption. If there is
6374 an error, drop the object and start over. */
6375 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6376 if (v == NULL)
6377 goto onError;
6378 d = PyUnicode_1BYTE_DATA(v);
6379 for (i = 0; i < size; i++) {
6380 unsigned char ch = ((unsigned char*)s)[i];
6381 if (ch < 128)
6382 d[i] = ch;
6383 else
6384 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006385 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006386 if (i == size)
6387 return (PyObject*)v;
6388 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006389
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390 v = _PyUnicode_New(size);
6391 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006392 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006394 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006396 e = s + size;
6397 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 register unsigned char c = (unsigned char)*s;
6399 if (c < 128) {
6400 *p++ = c;
6401 ++s;
6402 }
6403 else {
6404 startinpos = s-starts;
6405 endinpos = startinpos + 1;
6406 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6407 if (unicode_decode_call_errorhandler(
6408 errors, &errorHandler,
6409 "ascii", "ordinal not in range(128)",
6410 &starts, &e, &startinpos, &endinpos, &exc, &s,
6411 &v, &outpos, &p))
6412 goto onError;
6413 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006415 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006416 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006418 Py_XDECREF(errorHandler);
6419 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006420 if (PyUnicode_READY(v) == -1) {
6421 Py_DECREF(v);
6422 return NULL;
6423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006425
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006428 Py_XDECREF(errorHandler);
6429 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430 return NULL;
6431}
6432
Alexander Belopolsky40018472011-02-26 01:02:56 +00006433PyObject *
6434PyUnicode_EncodeASCII(const Py_UNICODE *p,
6435 Py_ssize_t size,
6436 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006438 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439}
6440
Alexander Belopolsky40018472011-02-26 01:02:56 +00006441PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006442_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443{
6444 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 PyErr_BadArgument();
6446 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006448 if (PyUnicode_READY(unicode) == -1)
6449 return NULL;
6450 /* Fast path: if it is an ASCII-only string, construct bytes object
6451 directly. Else defer to above function to raise the exception. */
6452 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6453 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6454 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006457 errors);
6458}
6459
6460PyObject *
6461PyUnicode_AsASCIIString(PyObject *unicode)
6462{
6463 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464}
6465
Victor Stinner99b95382011-07-04 14:23:54 +02006466#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006467
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006468/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006469
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006470#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006471#define NEED_RETRY
6472#endif
6473
6474/* XXX This code is limited to "true" double-byte encodings, as
6475 a) it assumes an incomplete character consists of a single byte, and
6476 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006477 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006478
/* Return 1 if the byte at s[offset] is the lead byte of a double-byte
   character, 0 otherwise.  A byte that merely looks like a lead byte is
   only accepted after checking the character that precedes it. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6490
6491/*
6492 * Decode MBCS string into unicode object. If 'final' is set, converts
6493 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6494 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006495static int
6496decode_mbcs(PyUnicodeObject **v,
6497 const char *s, /* MBCS string */
6498 int size, /* sizeof MBCS string */
6499 int final,
6500 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006501{
6502 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006503 Py_ssize_t n;
6504 DWORD usize;
6505 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006506
6507 assert(size >= 0);
6508
Victor Stinner554f3f02010-06-16 23:33:54 +00006509 /* check and handle 'errors' arg */
6510 if (errors==NULL || strcmp(errors, "strict")==0)
6511 flags = MB_ERR_INVALID_CHARS;
6512 else if (strcmp(errors, "ignore")==0)
6513 flags = 0;
6514 else {
6515 PyErr_Format(PyExc_ValueError,
6516 "mbcs encoding does not support errors='%s'",
6517 errors);
6518 return -1;
6519 }
6520
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006521 /* Skip trailing lead-byte unless 'final' is set */
6522 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006524
6525 /* First get the size of the result */
6526 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006527 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6528 if (usize==0)
6529 goto mbcs_decode_error;
6530 } else
6531 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006532
6533 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 /* Create unicode object */
6535 *v = _PyUnicode_New(usize);
6536 if (*v == NULL)
6537 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006538 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006539 }
6540 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 /* Extend unicode object */
6542 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006543 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006545 }
6546
6547 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006548 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006550 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6551 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006552 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006553 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006554 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006555
6556mbcs_decode_error:
6557 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6558 we raise a UnicodeDecodeError - else it is a 'generic'
6559 windows error
6560 */
6561 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6562 /* Ideally, we should get reason from FormatMessage - this
6563 is the Windows 2000 English version of the message
6564 */
6565 PyObject *exc = NULL;
6566 const char *reason = "No mapping for the Unicode character exists "
6567 "in the target multi-byte code page.";
6568 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6569 if (exc != NULL) {
6570 PyCodec_StrictErrors(exc);
6571 Py_DECREF(exc);
6572 }
6573 } else {
6574 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6575 }
6576 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006577}
6578
Alexander Belopolsky40018472011-02-26 01:02:56 +00006579PyObject *
6580PyUnicode_DecodeMBCSStateful(const char *s,
6581 Py_ssize_t size,
6582 const char *errors,
6583 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006584{
6585 PyUnicodeObject *v = NULL;
6586 int done;
6587
6588 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006590
6591#ifdef NEED_RETRY
6592 retry:
6593 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006594 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006595 else
6596#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006597 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006598
6599 if (done < 0) {
6600 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006602 }
6603
6604 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006605 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006606
6607#ifdef NEED_RETRY
6608 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006609 s += done;
6610 size -= done;
6611 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006612 }
6613#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006614 if (PyUnicode_READY(v) == -1) {
6615 Py_DECREF(v);
6616 return NULL;
6617 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006618 return (PyObject *)v;
6619}
6620
Alexander Belopolsky40018472011-02-26 01:02:56 +00006621PyObject *
6622PyUnicode_DecodeMBCS(const char *s,
6623 Py_ssize_t size,
6624 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006625{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006626 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6627}
6628
6629/*
6630 * Convert unicode into string object (MBCS).
6631 * Returns 0 if succeed, -1 otherwise.
6632 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006633static int
6634encode_mbcs(PyObject **repr,
6635 const Py_UNICODE *p, /* unicode */
6636 int size, /* size of unicode */
6637 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006638{
Victor Stinner554f3f02010-06-16 23:33:54 +00006639 BOOL usedDefaultChar = FALSE;
6640 BOOL *pusedDefaultChar;
6641 int mbcssize;
6642 Py_ssize_t n;
6643 PyObject *exc = NULL;
6644 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006645
6646 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006647
Victor Stinner554f3f02010-06-16 23:33:54 +00006648 /* check and handle 'errors' arg */
6649 if (errors==NULL || strcmp(errors, "strict")==0) {
6650 flags = WC_NO_BEST_FIT_CHARS;
6651 pusedDefaultChar = &usedDefaultChar;
6652 } else if (strcmp(errors, "replace")==0) {
6653 flags = 0;
6654 pusedDefaultChar = NULL;
6655 } else {
6656 PyErr_Format(PyExc_ValueError,
6657 "mbcs encoding does not support errors='%s'",
6658 errors);
6659 return -1;
6660 }
6661
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006662 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006663 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006664 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6665 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006666 if (mbcssize == 0) {
6667 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6668 return -1;
6669 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006670 /* If we used a default char, then we failed! */
6671 if (pusedDefaultChar && *pusedDefaultChar)
6672 goto mbcs_encode_error;
6673 } else {
6674 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006675 }
6676
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006677 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 /* Create string object */
6679 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6680 if (*repr == NULL)
6681 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006682 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006683 }
6684 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006685 /* Extend string object */
6686 n = PyBytes_Size(*repr);
6687 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6688 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006689 }
6690
6691 /* Do the conversion */
6692 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006693 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006694 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6695 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6697 return -1;
6698 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006699 if (pusedDefaultChar && *pusedDefaultChar)
6700 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006701 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006702 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006703
6704mbcs_encode_error:
6705 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6706 Py_XDECREF(exc);
6707 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006708}
6709
Alexander Belopolsky40018472011-02-26 01:02:56 +00006710PyObject *
6711PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6712 Py_ssize_t size,
6713 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006714{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006715 PyObject *repr = NULL;
6716 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006717
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006718#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006720 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006721 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006722 else
6723#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006724 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006725
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006726 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 Py_XDECREF(repr);
6728 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006729 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006730
6731#ifdef NEED_RETRY
6732 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 p += INT_MAX;
6734 size -= INT_MAX;
6735 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006736 }
6737#endif
6738
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006739 return repr;
6740}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006741
Alexander Belopolsky40018472011-02-26 01:02:56 +00006742PyObject *
6743PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006744{
6745 if (!PyUnicode_Check(unicode)) {
6746 PyErr_BadArgument();
6747 return NULL;
6748 }
6749 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006750 PyUnicode_GET_SIZE(unicode),
6751 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006752}
6753
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006754#undef NEED_RETRY
6755
Victor Stinner99b95382011-07-04 14:23:54 +02006756#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006757
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758/* --- Character Mapping Codec -------------------------------------------- */
6759
Alexander Belopolsky40018472011-02-26 01:02:56 +00006760PyObject *
6761PyUnicode_DecodeCharmap(const char *s,
6762 Py_ssize_t size,
6763 PyObject *mapping,
6764 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006766 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006767 Py_ssize_t startinpos;
6768 Py_ssize_t endinpos;
6769 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006770 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 PyUnicodeObject *v;
6772 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006773 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006774 PyObject *errorHandler = NULL;
6775 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006776 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006777 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006778
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779 /* Default to Latin-1 */
6780 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782
6783 v = _PyUnicode_New(size);
6784 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006785 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006787 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006789 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006790 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006791 mapstring = PyUnicode_AS_UNICODE(mapping);
6792 maplen = PyUnicode_GET_SIZE(mapping);
6793 while (s < e) {
6794 unsigned char ch = *s;
6795 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796
Benjamin Peterson29060642009-01-31 22:14:21 +00006797 if (ch < maplen)
6798 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799
Benjamin Peterson29060642009-01-31 22:14:21 +00006800 if (x == 0xfffe) {
6801 /* undefined mapping */
6802 outpos = p-PyUnicode_AS_UNICODE(v);
6803 startinpos = s-starts;
6804 endinpos = startinpos+1;
6805 if (unicode_decode_call_errorhandler(
6806 errors, &errorHandler,
6807 "charmap", "character maps to <undefined>",
6808 &starts, &e, &startinpos, &endinpos, &exc, &s,
6809 &v, &outpos, &p)) {
6810 goto onError;
6811 }
6812 continue;
6813 }
6814 *p++ = x;
6815 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006816 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006817 }
6818 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006819 while (s < e) {
6820 unsigned char ch = *s;
6821 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006822
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6824 w = PyLong_FromLong((long)ch);
6825 if (w == NULL)
6826 goto onError;
6827 x = PyObject_GetItem(mapping, w);
6828 Py_DECREF(w);
6829 if (x == NULL) {
6830 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6831 /* No mapping found means: mapping is undefined. */
6832 PyErr_Clear();
6833 x = Py_None;
6834 Py_INCREF(x);
6835 } else
6836 goto onError;
6837 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006838
Benjamin Peterson29060642009-01-31 22:14:21 +00006839 /* Apply mapping */
6840 if (PyLong_Check(x)) {
6841 long value = PyLong_AS_LONG(x);
6842 if (value < 0 || value > 65535) {
6843 PyErr_SetString(PyExc_TypeError,
6844 "character mapping must be in range(65536)");
6845 Py_DECREF(x);
6846 goto onError;
6847 }
6848 *p++ = (Py_UNICODE)value;
6849 }
6850 else if (x == Py_None) {
6851 /* undefined mapping */
6852 outpos = p-PyUnicode_AS_UNICODE(v);
6853 startinpos = s-starts;
6854 endinpos = startinpos+1;
6855 if (unicode_decode_call_errorhandler(
6856 errors, &errorHandler,
6857 "charmap", "character maps to <undefined>",
6858 &starts, &e, &startinpos, &endinpos, &exc, &s,
6859 &v, &outpos, &p)) {
6860 Py_DECREF(x);
6861 goto onError;
6862 }
6863 Py_DECREF(x);
6864 continue;
6865 }
6866 else if (PyUnicode_Check(x)) {
6867 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006868
Benjamin Peterson29060642009-01-31 22:14:21 +00006869 if (targetsize == 1)
6870 /* 1-1 mapping */
6871 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006872
Benjamin Peterson29060642009-01-31 22:14:21 +00006873 else if (targetsize > 1) {
6874 /* 1-n mapping */
6875 if (targetsize > extrachars) {
6876 /* resize first */
6877 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6878 Py_ssize_t needed = (targetsize - extrachars) + \
6879 (targetsize << 2);
6880 extrachars += needed;
6881 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02006882 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 PyUnicode_GET_SIZE(v) + needed) < 0) {
6884 Py_DECREF(x);
6885 goto onError;
6886 }
6887 p = PyUnicode_AS_UNICODE(v) + oldpos;
6888 }
6889 Py_UNICODE_COPY(p,
6890 PyUnicode_AS_UNICODE(x),
6891 targetsize);
6892 p += targetsize;
6893 extrachars -= targetsize;
6894 }
6895 /* 1-0 mapping: skip the character */
6896 }
6897 else {
6898 /* wrong return value */
6899 PyErr_SetString(PyExc_TypeError,
6900 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006901 Py_DECREF(x);
6902 goto onError;
6903 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006904 Py_DECREF(x);
6905 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006906 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 }
6908 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006909 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006910 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006911 Py_XDECREF(errorHandler);
6912 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006913 if (PyUnicode_READY(v) == -1) {
6914 Py_DECREF(v);
6915 return NULL;
6916 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006918
Benjamin Peterson29060642009-01-31 22:14:21 +00006919 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006920 Py_XDECREF(errorHandler);
6921 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922 Py_XDECREF(v);
6923 return NULL;
6924}
6925
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006926/* Charmap encoding: the lookup table */
6927
Alexander Belopolsky40018472011-02-26 01:02:56 +00006928struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 PyObject_HEAD
6930 unsigned char level1[32];
6931 int count2, count3;
6932 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006933};
6934
6935static PyObject*
6936encoding_map_size(PyObject *obj, PyObject* args)
6937{
6938 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006939 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006940 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006941}
6942
6943static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006944 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006945 PyDoc_STR("Return the size (in bytes) of this object") },
6946 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006947};
6948
6949static void
6950encoding_map_dealloc(PyObject* o)
6951{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006952 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006953}
6954
6955static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006956 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 "EncodingMap", /*tp_name*/
6958 sizeof(struct encoding_map), /*tp_basicsize*/
6959 0, /*tp_itemsize*/
6960 /* methods */
6961 encoding_map_dealloc, /*tp_dealloc*/
6962 0, /*tp_print*/
6963 0, /*tp_getattr*/
6964 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006965 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006966 0, /*tp_repr*/
6967 0, /*tp_as_number*/
6968 0, /*tp_as_sequence*/
6969 0, /*tp_as_mapping*/
6970 0, /*tp_hash*/
6971 0, /*tp_call*/
6972 0, /*tp_str*/
6973 0, /*tp_getattro*/
6974 0, /*tp_setattro*/
6975 0, /*tp_as_buffer*/
6976 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6977 0, /*tp_doc*/
6978 0, /*tp_traverse*/
6979 0, /*tp_clear*/
6980 0, /*tp_richcompare*/
6981 0, /*tp_weaklistoffset*/
6982 0, /*tp_iter*/
6983 0, /*tp_iternext*/
6984 encoding_map_methods, /*tp_methods*/
6985 0, /*tp_members*/
6986 0, /*tp_getset*/
6987 0, /*tp_base*/
6988 0, /*tp_dict*/
6989 0, /*tp_descr_get*/
6990 0, /*tp_descr_set*/
6991 0, /*tp_dictoffset*/
6992 0, /*tp_init*/
6993 0, /*tp_alloc*/
6994 0, /*tp_new*/
6995 0, /*tp_free*/
6996 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006997};
6998
6999PyObject*
7000PyUnicode_BuildEncodingMap(PyObject* string)
7001{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007002 PyObject *result;
7003 struct encoding_map *mresult;
7004 int i;
7005 int need_dict = 0;
7006 unsigned char level1[32];
7007 unsigned char level2[512];
7008 unsigned char *mlevel1, *mlevel2, *mlevel3;
7009 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007010 int kind;
7011 void *data;
7012 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007014 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007015 PyErr_BadArgument();
7016 return NULL;
7017 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007018 kind = PyUnicode_KIND(string);
7019 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007020 memset(level1, 0xFF, sizeof level1);
7021 memset(level2, 0xFF, sizeof level2);
7022
7023 /* If there isn't a one-to-one mapping of NULL to \0,
7024 or if there are non-BMP characters, we need to use
7025 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007026 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007027 need_dict = 1;
7028 for (i = 1; i < 256; i++) {
7029 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007030 ch = PyUnicode_READ(kind, data, i);
7031 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007032 need_dict = 1;
7033 break;
7034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007035 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007036 /* unmapped character */
7037 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007038 l1 = ch >> 11;
7039 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007040 if (level1[l1] == 0xFF)
7041 level1[l1] = count2++;
7042 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007043 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007044 }
7045
7046 if (count2 >= 0xFF || count3 >= 0xFF)
7047 need_dict = 1;
7048
7049 if (need_dict) {
7050 PyObject *result = PyDict_New();
7051 PyObject *key, *value;
7052 if (!result)
7053 return NULL;
7054 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007055 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007056 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007057 if (!key || !value)
7058 goto failed1;
7059 if (PyDict_SetItem(result, key, value) == -1)
7060 goto failed1;
7061 Py_DECREF(key);
7062 Py_DECREF(value);
7063 }
7064 return result;
7065 failed1:
7066 Py_XDECREF(key);
7067 Py_XDECREF(value);
7068 Py_DECREF(result);
7069 return NULL;
7070 }
7071
7072 /* Create a three-level trie */
7073 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7074 16*count2 + 128*count3 - 1);
7075 if (!result)
7076 return PyErr_NoMemory();
7077 PyObject_Init(result, &EncodingMapType);
7078 mresult = (struct encoding_map*)result;
7079 mresult->count2 = count2;
7080 mresult->count3 = count3;
7081 mlevel1 = mresult->level1;
7082 mlevel2 = mresult->level23;
7083 mlevel3 = mresult->level23 + 16*count2;
7084 memcpy(mlevel1, level1, 32);
7085 memset(mlevel2, 0xFF, 16*count2);
7086 memset(mlevel3, 0, 128*count3);
7087 count3 = 0;
7088 for (i = 1; i < 256; i++) {
7089 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007090 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007091 /* unmapped character */
7092 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007093 o1 = PyUnicode_READ(kind, data, i)>>11;
7094 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007095 i2 = 16*mlevel1[o1] + o2;
7096 if (mlevel2[i2] == 0xFF)
7097 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007098 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007099 i3 = 128*mlevel2[i2] + o3;
7100 mlevel3[i3] = i;
7101 }
7102 return result;
7103}
7104
7105static int
7106encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7107{
7108 struct encoding_map *map = (struct encoding_map*)mapping;
7109 int l1 = c>>11;
7110 int l2 = (c>>7) & 0xF;
7111 int l3 = c & 0x7F;
7112 int i;
7113
7114#ifdef Py_UNICODE_WIDE
7115 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007116 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007117 }
7118#endif
7119 if (c == 0)
7120 return 0;
7121 /* level 1*/
7122 i = map->level1[l1];
7123 if (i == 0xFF) {
7124 return -1;
7125 }
7126 /* level 2*/
7127 i = map->level23[16*i+l2];
7128 if (i == 0xFF) {
7129 return -1;
7130 }
7131 /* level 3 */
7132 i = map->level23[16*map->count2 + 128*i + l3];
7133 if (i == 0) {
7134 return -1;
7135 }
7136 return i;
7137}
7138
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007139/* Lookup the character ch in the mapping. If the character
7140 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007141 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007142static PyObject *
7143charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144{
Christian Heimes217cfd12007-12-02 14:31:20 +00007145 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007146 PyObject *x;
7147
7148 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007149 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007150 x = PyObject_GetItem(mapping, w);
7151 Py_DECREF(w);
7152 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007153 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7154 /* No mapping found means: mapping is undefined. */
7155 PyErr_Clear();
7156 x = Py_None;
7157 Py_INCREF(x);
7158 return x;
7159 } else
7160 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007162 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007163 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007164 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007165 long value = PyLong_AS_LONG(x);
7166 if (value < 0 || value > 255) {
7167 PyErr_SetString(PyExc_TypeError,
7168 "character mapping must be in range(256)");
7169 Py_DECREF(x);
7170 return NULL;
7171 }
7172 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007174 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007175 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007177 /* wrong return value */
7178 PyErr_Format(PyExc_TypeError,
7179 "character mapping must return integer, bytes or None, not %.400s",
7180 x->ob_type->tp_name);
7181 Py_DECREF(x);
7182 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183 }
7184}
7185
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007186static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007187charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007188{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007189 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7190 /* exponentially overallocate to minimize reallocations */
7191 if (requiredsize < 2*outsize)
7192 requiredsize = 2*outsize;
7193 if (_PyBytes_Resize(outobj, requiredsize))
7194 return -1;
7195 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007196}
7197
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007201/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007202 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007203 space is available. Return a new reference to the object that
7204 was put in the output buffer, or Py_None, if the mapping was undefined
7205 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007206 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007207static charmapencode_result
7208charmapencode_output(Py_UNICODE c, PyObject *mapping,
7209 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007210{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007211 PyObject *rep;
7212 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007213 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007214
Christian Heimes90aa7642007-12-19 02:45:37 +00007215 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007216 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007217 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007218 if (res == -1)
7219 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007220 if (outsize<requiredsize)
7221 if (charmapencode_resize(outobj, outpos, requiredsize))
7222 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007223 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007224 outstart[(*outpos)++] = (char)res;
7225 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007226 }
7227
7228 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007229 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007230 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007231 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007232 Py_DECREF(rep);
7233 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007234 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007235 if (PyLong_Check(rep)) {
7236 Py_ssize_t requiredsize = *outpos+1;
7237 if (outsize<requiredsize)
7238 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7239 Py_DECREF(rep);
7240 return enc_EXCEPTION;
7241 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007242 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007243 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007244 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007245 else {
7246 const char *repchars = PyBytes_AS_STRING(rep);
7247 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7248 Py_ssize_t requiredsize = *outpos+repsize;
7249 if (outsize<requiredsize)
7250 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7251 Py_DECREF(rep);
7252 return enc_EXCEPTION;
7253 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007254 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007255 memcpy(outstart + *outpos, repchars, repsize);
7256 *outpos += repsize;
7257 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007258 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007259 Py_DECREF(rep);
7260 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007261}
7262
7263/* handle an error in PyUnicode_EncodeCharmap
7264 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007265static int
7266charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007267 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007268 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007269 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007270 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007271{
7272 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007273 Py_ssize_t repsize;
7274 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007275 Py_UNICODE *uni2;
7276 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007277 Py_ssize_t collstartpos = *inpos;
7278 Py_ssize_t collendpos = *inpos+1;
7279 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007280 char *encoding = "charmap";
7281 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007282 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007283
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007284 /* find all unencodable characters */
7285 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007286 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007287 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007288 int res = encoding_map_lookup(p[collendpos], mapping);
7289 if (res != -1)
7290 break;
7291 ++collendpos;
7292 continue;
7293 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007294
Benjamin Peterson29060642009-01-31 22:14:21 +00007295 rep = charmapencode_lookup(p[collendpos], mapping);
7296 if (rep==NULL)
7297 return -1;
7298 else if (rep!=Py_None) {
7299 Py_DECREF(rep);
7300 break;
7301 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007302 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007303 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007304 }
7305 /* cache callback name lookup
7306 * (if not done yet, i.e. it's the first error) */
7307 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007308 if ((errors==NULL) || (!strcmp(errors, "strict")))
7309 *known_errorHandler = 1;
7310 else if (!strcmp(errors, "replace"))
7311 *known_errorHandler = 2;
7312 else if (!strcmp(errors, "ignore"))
7313 *known_errorHandler = 3;
7314 else if (!strcmp(errors, "xmlcharrefreplace"))
7315 *known_errorHandler = 4;
7316 else
7317 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007318 }
7319 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007320 case 1: /* strict */
7321 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7322 return -1;
7323 case 2: /* replace */
7324 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007325 x = charmapencode_output('?', mapping, res, respos);
7326 if (x==enc_EXCEPTION) {
7327 return -1;
7328 }
7329 else if (x==enc_FAILED) {
7330 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7331 return -1;
7332 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007333 }
7334 /* fall through */
7335 case 3: /* ignore */
7336 *inpos = collendpos;
7337 break;
7338 case 4: /* xmlcharrefreplace */
7339 /* generate replacement (temporarily (mis)uses p) */
7340 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007341 char buffer[2+29+1+1];
7342 char *cp;
7343 sprintf(buffer, "&#%d;", (int)p[collpos]);
7344 for (cp = buffer; *cp; ++cp) {
7345 x = charmapencode_output(*cp, mapping, res, respos);
7346 if (x==enc_EXCEPTION)
7347 return -1;
7348 else if (x==enc_FAILED) {
7349 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7350 return -1;
7351 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007352 }
7353 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007354 *inpos = collendpos;
7355 break;
7356 default:
7357 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007358 encoding, reason, p, size, exceptionObject,
7359 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007360 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007361 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007362 if (PyBytes_Check(repunicode)) {
7363 /* Directly copy bytes result to output. */
7364 Py_ssize_t outsize = PyBytes_Size(*res);
7365 Py_ssize_t requiredsize;
7366 repsize = PyBytes_Size(repunicode);
7367 requiredsize = *respos + repsize;
7368 if (requiredsize > outsize)
7369 /* Make room for all additional bytes. */
7370 if (charmapencode_resize(res, respos, requiredsize)) {
7371 Py_DECREF(repunicode);
7372 return -1;
7373 }
7374 memcpy(PyBytes_AsString(*res) + *respos,
7375 PyBytes_AsString(repunicode), repsize);
7376 *respos += repsize;
7377 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007378 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007379 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007380 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007381 /* generate replacement */
7382 repsize = PyUnicode_GET_SIZE(repunicode);
7383 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 x = charmapencode_output(*uni2, mapping, res, respos);
7385 if (x==enc_EXCEPTION) {
7386 return -1;
7387 }
7388 else if (x==enc_FAILED) {
7389 Py_DECREF(repunicode);
7390 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7391 return -1;
7392 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007393 }
7394 *inpos = newpos;
7395 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007396 }
7397 return 0;
7398}
7399
Alexander Belopolsky40018472011-02-26 01:02:56 +00007400PyObject *
7401PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7402 Py_ssize_t size,
7403 PyObject *mapping,
7404 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007406 /* output object */
7407 PyObject *res = NULL;
7408 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007409 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007410 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007411 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007412 PyObject *errorHandler = NULL;
7413 PyObject *exc = NULL;
7414 /* the following variable is used for caching string comparisons
7415 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7416 * 3=ignore, 4=xmlcharrefreplace */
7417 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418
7419 /* Default to Latin-1 */
7420 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007423 /* allocate enough for a simple encoding without
7424 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007425 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007426 if (res == NULL)
7427 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007428 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007429 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007431 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 /* try to encode it */
7433 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7434 if (x==enc_EXCEPTION) /* error */
7435 goto onError;
7436 if (x==enc_FAILED) { /* unencodable character */
7437 if (charmap_encoding_error(p, size, &inpos, mapping,
7438 &exc,
7439 &known_errorHandler, &errorHandler, errors,
7440 &res, &respos)) {
7441 goto onError;
7442 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007443 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007444 else
7445 /* done with this character => adjust input position */
7446 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007449 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007450 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007451 if (_PyBytes_Resize(&res, respos) < 0)
7452 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007453
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007454 Py_XDECREF(exc);
7455 Py_XDECREF(errorHandler);
7456 return res;
7457
Benjamin Peterson29060642009-01-31 22:14:21 +00007458 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007459 Py_XDECREF(res);
7460 Py_XDECREF(exc);
7461 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 return NULL;
7463}
7464
Alexander Belopolsky40018472011-02-26 01:02:56 +00007465PyObject *
7466PyUnicode_AsCharmapString(PyObject *unicode,
7467 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468{
7469 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 PyErr_BadArgument();
7471 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472 }
7473 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 PyUnicode_GET_SIZE(unicode),
7475 mapping,
7476 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477}
7478
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007479/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007480static void
7481make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007482 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007483 Py_ssize_t startpos, Py_ssize_t endpos,
7484 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007486 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007487 *exceptionObject = _PyUnicodeTranslateError_Create(
7488 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489 }
7490 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007491 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7492 goto onError;
7493 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7494 goto onError;
7495 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7496 goto onError;
7497 return;
7498 onError:
7499 Py_DECREF(*exceptionObject);
7500 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501 }
7502}
7503
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007504/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007505static void
7506raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007507 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007508 Py_ssize_t startpos, Py_ssize_t endpos,
7509 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007510{
7511 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007512 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007513 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007515}
7516
7517/* error handling callback helper:
7518 build arguments, call the callback and check the arguments,
7519 put the result into newpos and return the replacement string, which
7520 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007521static PyObject *
7522unicode_translate_call_errorhandler(const char *errors,
7523 PyObject **errorHandler,
7524 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007525 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007526 Py_ssize_t startpos, Py_ssize_t endpos,
7527 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007528{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007529 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007530
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007531 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007532 PyObject *restuple;
7533 PyObject *resunicode;
7534
7535 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007536 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007537 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007538 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007539 }
7540
7541 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007542 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007543 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007544 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007545
7546 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007547 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007548 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007549 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007550 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007551 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007552 Py_DECREF(restuple);
7553 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007554 }
7555 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007556 &resunicode, &i_newpos)) {
7557 Py_DECREF(restuple);
7558 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007559 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007560 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007561 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007562 else
7563 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007564 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007565 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7566 Py_DECREF(restuple);
7567 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007568 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007569 Py_INCREF(resunicode);
7570 Py_DECREF(restuple);
7571 return resunicode;
7572}
7573
7574/* Lookup the character ch in the mapping and put the result in result,
7575 which must be decrefed by the caller.
7576 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007577static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007578charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007579{
Christian Heimes217cfd12007-12-02 14:31:20 +00007580 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007581 PyObject *x;
7582
7583 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007584 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007585 x = PyObject_GetItem(mapping, w);
7586 Py_DECREF(w);
7587 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7589 /* No mapping found means: use 1:1 mapping. */
7590 PyErr_Clear();
7591 *result = NULL;
7592 return 0;
7593 } else
7594 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007595 }
7596 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007597 *result = x;
7598 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007599 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007600 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007601 long value = PyLong_AS_LONG(x);
7602 long max = PyUnicode_GetMax();
7603 if (value < 0 || value > max) {
7604 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007605 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007606 Py_DECREF(x);
7607 return -1;
7608 }
7609 *result = x;
7610 return 0;
7611 }
7612 else if (PyUnicode_Check(x)) {
7613 *result = x;
7614 return 0;
7615 }
7616 else {
7617 /* wrong return value */
7618 PyErr_SetString(PyExc_TypeError,
7619 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007620 Py_DECREF(x);
7621 return -1;
7622 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007623}
7624/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007625 if not reallocate and adjust various state variables.
7626 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007627static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007628charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007629 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007630{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007631 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007632 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007633 /* exponentially overallocate to minimize reallocations */
7634 if (requiredsize < 2 * oldsize)
7635 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007636 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7637 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007638 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007639 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007640 }
7641 return 0;
7642}
7643/* lookup the character, put the result in the output string and adjust
7644 various state variables. Return a new reference to the object that
7645 was put in the output buffer in *result, or Py_None, if the mapping was
7646 undefined (in which case no character was written).
7647 The called must decref result.
7648 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007649static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007650charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7651 PyObject *mapping, Py_UCS4 **output,
7652 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007653 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007654{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007655 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7656 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007657 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007658 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007659 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007660 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007661 }
7662 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007663 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007664 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007665 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007666 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007667 }
7668 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007669 Py_ssize_t repsize;
7670 if (PyUnicode_READY(*res) == -1)
7671 return -1;
7672 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 if (repsize==1) {
7674 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007675 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007676 }
7677 else if (repsize!=0) {
7678 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007679 Py_ssize_t requiredsize = *opos +
7680 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007681 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007682 Py_ssize_t i;
7683 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007685 for(i = 0; i < repsize; i++)
7686 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007687 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007688 }
7689 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007690 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007691 return 0;
7692}
7693
Alexander Belopolsky40018472011-02-26 01:02:56 +00007694PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007695_PyUnicode_TranslateCharmap(PyObject *input,
7696 PyObject *mapping,
7697 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007698{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007699 /* input object */
7700 char *idata;
7701 Py_ssize_t size, i;
7702 int kind;
7703 /* output buffer */
7704 Py_UCS4 *output = NULL;
7705 Py_ssize_t osize;
7706 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007707 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007708 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007709 char *reason = "character maps to <undefined>";
7710 PyObject *errorHandler = NULL;
7711 PyObject *exc = NULL;
7712 /* the following variable is used for caching string comparisons
7713 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7714 * 3=ignore, 4=xmlcharrefreplace */
7715 int known_errorHandler = -1;
7716
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007718 PyErr_BadArgument();
7719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007722 if (PyUnicode_READY(input) == -1)
7723 return NULL;
7724 idata = (char*)PyUnicode_DATA(input);
7725 kind = PyUnicode_KIND(input);
7726 size = PyUnicode_GET_LENGTH(input);
7727 i = 0;
7728
7729 if (size == 0) {
7730 Py_INCREF(input);
7731 return input;
7732 }
7733
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007734 /* allocate enough for a simple 1:1 translation without
7735 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007736 osize = size;
7737 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7738 opos = 0;
7739 if (output == NULL) {
7740 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007741 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007742 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007744 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 /* try to encode it */
7746 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007747 if (charmaptranslate_output(input, i, mapping,
7748 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007749 Py_XDECREF(x);
7750 goto onError;
7751 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007752 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007753 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007754 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007755 else { /* untranslatable character */
7756 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7757 Py_ssize_t repsize;
7758 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007759 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007761 Py_ssize_t collstart = i;
7762 Py_ssize_t collend = i+1;
7763 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764
Benjamin Peterson29060642009-01-31 22:14:21 +00007765 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007766 while (collend < size) {
7767 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007768 goto onError;
7769 Py_XDECREF(x);
7770 if (x!=Py_None)
7771 break;
7772 ++collend;
7773 }
7774 /* cache callback name lookup
7775 * (if not done yet, i.e. it's the first error) */
7776 if (known_errorHandler==-1) {
7777 if ((errors==NULL) || (!strcmp(errors, "strict")))
7778 known_errorHandler = 1;
7779 else if (!strcmp(errors, "replace"))
7780 known_errorHandler = 2;
7781 else if (!strcmp(errors, "ignore"))
7782 known_errorHandler = 3;
7783 else if (!strcmp(errors, "xmlcharrefreplace"))
7784 known_errorHandler = 4;
7785 else
7786 known_errorHandler = 0;
7787 }
7788 switch (known_errorHandler) {
7789 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007790 raise_translate_exception(&exc, input, collstart,
7791 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007792 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 case 2: /* replace */
7794 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007795 for (coll = collstart; coll<collend; coll++)
7796 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 /* fall through */
7798 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007799 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 break;
7801 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007802 /* generate replacement (temporarily (mis)uses i) */
7803 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007804 char buffer[2+29+1+1];
7805 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007806 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7807 if (charmaptranslate_makespace(&output, &osize,
7808 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007809 goto onError;
7810 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007811 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007813 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007814 break;
7815 default:
7816 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007817 reason, input, &exc,
7818 collstart, collend, &newpos);
7819 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 goto onError;
7821 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007822 repsize = PyUnicode_GET_LENGTH(repunicode);
7823 if (charmaptranslate_makespace(&output, &osize,
7824 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007825 Py_DECREF(repunicode);
7826 goto onError;
7827 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007828 for (uni2 = 0; repsize-->0; ++uni2)
7829 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7830 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007832 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007833 }
7834 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007835 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7836 if (!res)
7837 goto onError;
7838 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007839 Py_XDECREF(exc);
7840 Py_XDECREF(errorHandler);
7841 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842
Benjamin Peterson29060642009-01-31 22:14:21 +00007843 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007844 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007845 Py_XDECREF(exc);
7846 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007847 return NULL;
7848}
7849
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007850/* Deprecated. Use PyUnicode_Translate instead. */
7851PyObject *
7852PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7853 Py_ssize_t size,
7854 PyObject *mapping,
7855 const char *errors)
7856{
7857 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7858 if (!unicode)
7859 return NULL;
7860 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7861}
7862
Alexander Belopolsky40018472011-02-26 01:02:56 +00007863PyObject *
7864PyUnicode_Translate(PyObject *str,
7865 PyObject *mapping,
7866 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867{
7868 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007869
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870 str = PyUnicode_FromObject(str);
7871 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007872 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007873 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 Py_DECREF(str);
7875 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007876
Benjamin Peterson29060642009-01-31 22:14:21 +00007877 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878 Py_XDECREF(str);
7879 return NULL;
7880}
Tim Petersced69f82003-09-16 20:30:58 +00007881
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007882static Py_UCS4
7883fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7884{
7885 /* No need to call PyUnicode_READY(self) because this function is only
7886 called as a callback from fixup() which does it already. */
7887 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7888 const int kind = PyUnicode_KIND(self);
7889 void *data = PyUnicode_DATA(self);
7890 Py_UCS4 maxchar = 0, ch, fixed;
7891 Py_ssize_t i;
7892
7893 for (i = 0; i < len; ++i) {
7894 ch = PyUnicode_READ(kind, data, i);
7895 fixed = 0;
7896 if (ch > 127) {
7897 if (Py_UNICODE_ISSPACE(ch))
7898 fixed = ' ';
7899 else {
7900 const int decimal = Py_UNICODE_TODECIMAL(ch);
7901 if (decimal >= 0)
7902 fixed = '0' + decimal;
7903 }
7904 if (fixed != 0) {
7905 if (fixed > maxchar)
7906 maxchar = fixed;
7907 PyUnicode_WRITE(kind, data, i, fixed);
7908 }
7909 else if (ch > maxchar)
7910 maxchar = ch;
7911 }
7912 else if (ch > maxchar)
7913 maxchar = ch;
7914 }
7915
7916 return maxchar;
7917}
7918
7919PyObject *
7920_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7921{
7922 if (!PyUnicode_Check(unicode)) {
7923 PyErr_BadInternalCall();
7924 return NULL;
7925 }
7926 if (PyUnicode_READY(unicode) == -1)
7927 return NULL;
7928 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7929 /* If the string is already ASCII, just return the same string */
7930 Py_INCREF(unicode);
7931 return unicode;
7932 }
7933 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7934}
7935
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007936PyObject *
7937PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7938 Py_ssize_t length)
7939{
7940 PyObject *result;
7941 Py_UNICODE *p; /* write pointer into result */
7942 Py_ssize_t i;
7943 /* Copy to a new string */
7944 result = (PyObject *)_PyUnicode_New(length);
7945 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7946 if (result == NULL)
7947 return result;
7948 p = PyUnicode_AS_UNICODE(result);
7949 /* Iterate over code points */
7950 for (i = 0; i < length; i++) {
7951 Py_UNICODE ch =s[i];
7952 if (ch > 127) {
7953 int decimal = Py_UNICODE_TODECIMAL(ch);
7954 if (decimal >= 0)
7955 p[i] = '0' + decimal;
7956 }
7957 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007958 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7959 Py_DECREF(result);
7960 return NULL;
7961 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007962 return result;
7963}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007964/* --- Decimal Encoder ---------------------------------------------------- */
7965
Alexander Belopolsky40018472011-02-26 01:02:56 +00007966int
7967PyUnicode_EncodeDecimal(Py_UNICODE *s,
7968 Py_ssize_t length,
7969 char *output,
7970 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007971{
7972 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007973 PyObject *errorHandler = NULL;
7974 PyObject *exc = NULL;
7975 const char *encoding = "decimal";
7976 const char *reason = "invalid decimal Unicode string";
7977 /* the following variable is used for caching string comparisons
7978 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7979 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007980
7981 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 PyErr_BadArgument();
7983 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007984 }
7985
7986 p = s;
7987 end = s + length;
7988 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 register Py_UNICODE ch = *p;
7990 int decimal;
7991 PyObject *repunicode;
7992 Py_ssize_t repsize;
7993 Py_ssize_t newpos;
7994 Py_UNICODE *uni2;
7995 Py_UNICODE *collstart;
7996 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007997
Benjamin Peterson29060642009-01-31 22:14:21 +00007998 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007999 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008000 ++p;
8001 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008002 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 decimal = Py_UNICODE_TODECIMAL(ch);
8004 if (decimal >= 0) {
8005 *output++ = '0' + decimal;
8006 ++p;
8007 continue;
8008 }
8009 if (0 < ch && ch < 256) {
8010 *output++ = (char)ch;
8011 ++p;
8012 continue;
8013 }
8014 /* All other characters are considered unencodable */
8015 collstart = p;
8016 collend = p+1;
8017 while (collend < end) {
8018 if ((0 < *collend && *collend < 256) ||
8019 !Py_UNICODE_ISSPACE(*collend) ||
8020 Py_UNICODE_TODECIMAL(*collend))
8021 break;
8022 }
8023 /* cache callback name lookup
8024 * (if not done yet, i.e. it's the first error) */
8025 if (known_errorHandler==-1) {
8026 if ((errors==NULL) || (!strcmp(errors, "strict")))
8027 known_errorHandler = 1;
8028 else if (!strcmp(errors, "replace"))
8029 known_errorHandler = 2;
8030 else if (!strcmp(errors, "ignore"))
8031 known_errorHandler = 3;
8032 else if (!strcmp(errors, "xmlcharrefreplace"))
8033 known_errorHandler = 4;
8034 else
8035 known_errorHandler = 0;
8036 }
8037 switch (known_errorHandler) {
8038 case 1: /* strict */
8039 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8040 goto onError;
8041 case 2: /* replace */
8042 for (p = collstart; p < collend; ++p)
8043 *output++ = '?';
8044 /* fall through */
8045 case 3: /* ignore */
8046 p = collend;
8047 break;
8048 case 4: /* xmlcharrefreplace */
8049 /* generate replacement (temporarily (mis)uses p) */
8050 for (p = collstart; p < collend; ++p)
8051 output += sprintf(output, "&#%d;", (int)*p);
8052 p = collend;
8053 break;
8054 default:
8055 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8056 encoding, reason, s, length, &exc,
8057 collstart-s, collend-s, &newpos);
8058 if (repunicode == NULL)
8059 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008060 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008061 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008062 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8063 Py_DECREF(repunicode);
8064 goto onError;
8065 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 /* generate replacement */
8067 repsize = PyUnicode_GET_SIZE(repunicode);
8068 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8069 Py_UNICODE ch = *uni2;
8070 if (Py_UNICODE_ISSPACE(ch))
8071 *output++ = ' ';
8072 else {
8073 decimal = Py_UNICODE_TODECIMAL(ch);
8074 if (decimal >= 0)
8075 *output++ = '0' + decimal;
8076 else if (0 < ch && ch < 256)
8077 *output++ = (char)ch;
8078 else {
8079 Py_DECREF(repunicode);
8080 raise_encode_exception(&exc, encoding,
8081 s, length, collstart-s, collend-s, reason);
8082 goto onError;
8083 }
8084 }
8085 }
8086 p = s + newpos;
8087 Py_DECREF(repunicode);
8088 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008089 }
8090 /* 0-terminate the output string */
8091 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008092 Py_XDECREF(exc);
8093 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008094 return 0;
8095
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008097 Py_XDECREF(exc);
8098 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008099 return -1;
8100}
8101
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102/* --- Helpers ------------------------------------------------------------ */
8103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008104#include "stringlib/ucs1lib.h"
8105#include "stringlib/fastsearch.h"
8106#include "stringlib/partition.h"
8107#include "stringlib/split.h"
8108#include "stringlib/count.h"
8109#include "stringlib/find.h"
8110#include "stringlib/localeutil.h"
8111#include "stringlib/undef.h"
8112
8113#include "stringlib/ucs2lib.h"
8114#include "stringlib/fastsearch.h"
8115#include "stringlib/partition.h"
8116#include "stringlib/split.h"
8117#include "stringlib/count.h"
8118#include "stringlib/find.h"
8119#include "stringlib/localeutil.h"
8120#include "stringlib/undef.h"
8121
8122#include "stringlib/ucs4lib.h"
8123#include "stringlib/fastsearch.h"
8124#include "stringlib/partition.h"
8125#include "stringlib/split.h"
8126#include "stringlib/count.h"
8127#include "stringlib/find.h"
8128#include "stringlib/localeutil.h"
8129#include "stringlib/undef.h"
8130
8131static Py_ssize_t
8132any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8133 const Py_UCS1*, Py_ssize_t,
8134 Py_ssize_t, Py_ssize_t),
8135 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8136 const Py_UCS2*, Py_ssize_t,
8137 Py_ssize_t, Py_ssize_t),
8138 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8139 const Py_UCS4*, Py_ssize_t,
8140 Py_ssize_t, Py_ssize_t),
8141 PyObject* s1, PyObject* s2,
8142 Py_ssize_t start,
8143 Py_ssize_t end)
8144{
8145 int kind1, kind2, kind;
8146 void *buf1, *buf2;
8147 Py_ssize_t len1, len2, result;
8148
8149 kind1 = PyUnicode_KIND(s1);
8150 kind2 = PyUnicode_KIND(s2);
8151 kind = kind1 > kind2 ? kind1 : kind2;
8152 buf1 = PyUnicode_DATA(s1);
8153 buf2 = PyUnicode_DATA(s2);
8154 if (kind1 != kind)
8155 buf1 = _PyUnicode_AsKind(s1, kind);
8156 if (!buf1)
8157 return -2;
8158 if (kind2 != kind)
8159 buf2 = _PyUnicode_AsKind(s2, kind);
8160 if (!buf2) {
8161 if (kind1 != kind) PyMem_Free(buf1);
8162 return -2;
8163 }
8164 len1 = PyUnicode_GET_LENGTH(s1);
8165 len2 = PyUnicode_GET_LENGTH(s2);
8166
8167 switch(kind) {
8168 case PyUnicode_1BYTE_KIND:
8169 result = ucs1(buf1, len1, buf2, len2, start, end);
8170 break;
8171 case PyUnicode_2BYTE_KIND:
8172 result = ucs2(buf1, len1, buf2, len2, start, end);
8173 break;
8174 case PyUnicode_4BYTE_KIND:
8175 result = ucs4(buf1, len1, buf2, len2, start, end);
8176 break;
8177 default:
8178 assert(0); result = -2;
8179 }
8180
8181 if (kind1 != kind)
8182 PyMem_Free(buf1);
8183 if (kind2 != kind)
8184 PyMem_Free(buf2);
8185
8186 return result;
8187}
8188
8189Py_ssize_t
8190_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8191 Py_ssize_t n_buffer,
8192 void *digits, Py_ssize_t n_digits,
8193 Py_ssize_t min_width,
8194 const char *grouping,
8195 const char *thousands_sep)
8196{
8197 switch(kind) {
8198 case PyUnicode_1BYTE_KIND:
8199 return _PyUnicode_ucs1_InsertThousandsGrouping(
8200 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8201 min_width, grouping, thousands_sep);
8202 case PyUnicode_2BYTE_KIND:
8203 return _PyUnicode_ucs2_InsertThousandsGrouping(
8204 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8205 min_width, grouping, thousands_sep);
8206 case PyUnicode_4BYTE_KIND:
8207 return _PyUnicode_ucs4_InsertThousandsGrouping(
8208 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8209 min_width, grouping, thousands_sep);
8210 }
8211 assert(0);
8212 return -1;
8213}
8214
8215
Eric Smith8c663262007-08-25 02:26:07 +00008216#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008217#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008218
Thomas Wouters477c8d52006-05-27 19:21:47 +00008219#include "stringlib/count.h"
8220#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008221
/* Helper macro: fix up the slice bounds `start` and `end`, in place, for a
   sequence of length `len`.

   - `end` greater than `len` is set to `len`.
   - A negative `end` or `start` has `len` added to it; if the result is
     still negative it is set to 0.
   - A non-negative `start` is left untouched, even when it exceeds `len`.

   `start` and `end` must be modifiable lvalues.  Every argument is expanded
   several times and without parentheses, so pass simple expressions that
   have no side effects.  The expansion is an if / else-if statement
   followed by a second if statement (there is no do { } while (0)
   wrapper), so do not use the macro as the unbraced body of another
   if/else or loop.

   NOTE(review): stringlib/find.h is included above and may define this
   macro as well -- confirm before changing the replacement list. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008236
Alexander Belopolsky40018472011-02-26 01:02:56 +00008237Py_ssize_t
8238PyUnicode_Count(PyObject *str,
8239 PyObject *substr,
8240 Py_ssize_t start,
8241 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008243 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008244 PyUnicodeObject* str_obj;
8245 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008246 int kind1, kind2, kind;
8247 void *buf1 = NULL, *buf2 = NULL;
8248 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008249
Thomas Wouters477c8d52006-05-27 19:21:47 +00008250 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008251 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008252 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008253 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008254 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 Py_DECREF(str_obj);
8256 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257 }
Tim Petersced69f82003-09-16 20:30:58 +00008258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008259 kind1 = PyUnicode_KIND(str_obj);
8260 kind2 = PyUnicode_KIND(sub_obj);
8261 kind = kind1 > kind2 ? kind1 : kind2;
8262 buf1 = PyUnicode_DATA(str_obj);
8263 if (kind1 != kind)
8264 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8265 if (!buf1)
8266 goto onError;
8267 buf2 = PyUnicode_DATA(sub_obj);
8268 if (kind2 != kind)
8269 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8270 if (!buf2)
8271 goto onError;
8272 len1 = PyUnicode_GET_LENGTH(str_obj);
8273 len2 = PyUnicode_GET_LENGTH(sub_obj);
8274
8275 ADJUST_INDICES(start, end, len1);
8276 switch(kind) {
8277 case PyUnicode_1BYTE_KIND:
8278 result = ucs1lib_count(
8279 ((Py_UCS1*)buf1) + start, end - start,
8280 buf2, len2, PY_SSIZE_T_MAX
8281 );
8282 break;
8283 case PyUnicode_2BYTE_KIND:
8284 result = ucs2lib_count(
8285 ((Py_UCS2*)buf1) + start, end - start,
8286 buf2, len2, PY_SSIZE_T_MAX
8287 );
8288 break;
8289 case PyUnicode_4BYTE_KIND:
8290 result = ucs4lib_count(
8291 ((Py_UCS4*)buf1) + start, end - start,
8292 buf2, len2, PY_SSIZE_T_MAX
8293 );
8294 break;
8295 default:
8296 assert(0); result = 0;
8297 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008298
8299 Py_DECREF(sub_obj);
8300 Py_DECREF(str_obj);
8301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008302 if (kind1 != kind)
8303 PyMem_Free(buf1);
8304 if (kind2 != kind)
8305 PyMem_Free(buf2);
8306
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008308 onError:
8309 Py_DECREF(sub_obj);
8310 Py_DECREF(str_obj);
8311 if (kind1 != kind && buf1)
8312 PyMem_Free(buf1);
8313 if (kind2 != kind && buf2)
8314 PyMem_Free(buf2);
8315 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316}
8317
Alexander Belopolsky40018472011-02-26 01:02:56 +00008318Py_ssize_t
8319PyUnicode_Find(PyObject *str,
8320 PyObject *sub,
8321 Py_ssize_t start,
8322 Py_ssize_t end,
8323 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008325 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008326
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008328 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008329 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008330 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008331 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008332 Py_DECREF(str);
8333 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334 }
Tim Petersced69f82003-09-16 20:30:58 +00008335
Thomas Wouters477c8d52006-05-27 19:21:47 +00008336 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008337 result = any_find_slice(
8338 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8339 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008340 );
8341 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008342 result = any_find_slice(
8343 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8344 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008345 );
8346
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008348 Py_DECREF(sub);
8349
Guido van Rossumd57fd912000-03-10 22:53:23 +00008350 return result;
8351}
8352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008353Py_ssize_t
8354PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8355 Py_ssize_t start, Py_ssize_t end,
8356 int direction)
8357{
8358 char *result;
8359 int kind;
8360 if (PyUnicode_READY(str) == -1)
8361 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008362 if (start < 0 || end < 0) {
8363 PyErr_SetString(PyExc_IndexError, "string index out of range");
8364 return -2;
8365 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008366 if (end > PyUnicode_GET_LENGTH(str))
8367 end = PyUnicode_GET_LENGTH(str);
8368 kind = PyUnicode_KIND(str);
8369 result = findchar(PyUnicode_1BYTE_DATA(str)
8370 + PyUnicode_KIND_SIZE(kind, start),
8371 kind,
8372 end-start, ch, direction);
8373 if (!result)
8374 return -1;
8375 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8376}
8377
Alexander Belopolsky40018472011-02-26 01:02:56 +00008378static int
8379tailmatch(PyUnicodeObject *self,
8380 PyUnicodeObject *substring,
8381 Py_ssize_t start,
8382 Py_ssize_t end,
8383 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008384{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008385 int kind_self;
8386 int kind_sub;
8387 void *data_self;
8388 void *data_sub;
8389 Py_ssize_t offset;
8390 Py_ssize_t i;
8391 Py_ssize_t end_sub;
8392
8393 if (PyUnicode_READY(self) == -1 ||
8394 PyUnicode_READY(substring) == -1)
8395 return 0;
8396
8397 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008398 return 1;
8399
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008400 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8401 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008402 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008405 kind_self = PyUnicode_KIND(self);
8406 data_self = PyUnicode_DATA(self);
8407 kind_sub = PyUnicode_KIND(substring);
8408 data_sub = PyUnicode_DATA(substring);
8409 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8410
8411 if (direction > 0)
8412 offset = end;
8413 else
8414 offset = start;
8415
8416 if (PyUnicode_READ(kind_self, data_self, offset) ==
8417 PyUnicode_READ(kind_sub, data_sub, 0) &&
8418 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8419 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8420 /* If both are of the same kind, memcmp is sufficient */
8421 if (kind_self == kind_sub) {
8422 return ! memcmp((char *)data_self +
8423 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8424 data_sub,
8425 PyUnicode_GET_LENGTH(substring) *
8426 PyUnicode_CHARACTER_SIZE(substring));
8427 }
8428 /* otherwise we have to compare each character by first accesing it */
8429 else {
8430 /* We do not need to compare 0 and len(substring)-1 because
8431 the if statement above ensured already that they are equal
8432 when we end up here. */
8433 // TODO: honor direction and do a forward or backwards search
8434 for (i = 1; i < end_sub; ++i) {
8435 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8436 PyUnicode_READ(kind_sub, data_sub, i))
8437 return 0;
8438 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008440 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008441 }
8442
8443 return 0;
8444}
8445
Alexander Belopolsky40018472011-02-26 01:02:56 +00008446Py_ssize_t
8447PyUnicode_Tailmatch(PyObject *str,
8448 PyObject *substr,
8449 Py_ssize_t start,
8450 Py_ssize_t end,
8451 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008453 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008454
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455 str = PyUnicode_FromObject(str);
8456 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458 substr = PyUnicode_FromObject(substr);
8459 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 Py_DECREF(str);
8461 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462 }
Tim Petersced69f82003-09-16 20:30:58 +00008463
Guido van Rossumd57fd912000-03-10 22:53:23 +00008464 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 (PyUnicodeObject *)substr,
8466 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008467 Py_DECREF(str);
8468 Py_DECREF(substr);
8469 return result;
8470}
8471
Guido van Rossumd57fd912000-03-10 22:53:23 +00008472/* Apply fixfct filter to the Unicode object self and return a
8473 reference to the modified object */
8474
Alexander Belopolsky40018472011-02-26 01:02:56 +00008475static PyObject *
8476fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008477 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008479 PyObject *u;
8480 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008482 if (PyUnicode_READY(self) == -1)
8483 return NULL;
8484 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8485 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8486 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008490 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8491 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008492
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008493 /* fix functions return the new maximum character in a string,
8494 if the kind of the resulting unicode object does not change,
8495 everything is fine. Otherwise we need to change the string kind
8496 and re-run the fix function. */
8497 maxchar_new = fixfct((PyUnicodeObject*)u);
8498 if (maxchar_new == 0)
8499 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8500 else if (maxchar_new <= 127)
8501 maxchar_new = 127;
8502 else if (maxchar_new <= 255)
8503 maxchar_new = 255;
8504 else if (maxchar_new <= 65535)
8505 maxchar_new = 65535;
8506 else
8507 maxchar_new = 1114111; /* 0x10ffff */
8508
8509 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 /* fixfct should return TRUE if it modified the buffer. If
8511 FALSE, return a reference to the original buffer instead
8512 (to save space, not time) */
8513 Py_INCREF(self);
8514 Py_DECREF(u);
8515 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008517 else if (maxchar_new == maxchar_old) {
8518 return u;
8519 }
8520 else {
8521 /* In case the maximum character changed, we need to
8522 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008523 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008524 if (v == NULL) {
8525 Py_DECREF(u);
8526 return NULL;
8527 }
8528 if (maxchar_new > maxchar_old) {
8529 /* If the maxchar increased so that the kind changed, not all
8530 characters are representable anymore and we need to fix the
8531 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008532 if (PyUnicode_CopyCharacters(v, 0,
8533 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008534 PyUnicode_GET_LENGTH(self)) < 0)
8535 {
8536 Py_DECREF(u);
8537 return NULL;
8538 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008539 maxchar_old = fixfct((PyUnicodeObject*)v);
8540 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8541 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008542 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008543 if (PyUnicode_CopyCharacters(v, 0,
8544 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008545 PyUnicode_GET_LENGTH(self)) < 0)
8546 {
8547 Py_DECREF(u);
8548 return NULL;
8549 }
8550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008551
8552 Py_DECREF(u);
8553 return v;
8554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555}
8556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008557static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008558fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008560 /* No need to call PyUnicode_READY(self) because this function is only
8561 called as a callback from fixup() which does it already. */
8562 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8563 const int kind = PyUnicode_KIND(self);
8564 void *data = PyUnicode_DATA(self);
8565 int touched = 0;
8566 Py_UCS4 maxchar = 0;
8567 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008569 for (i = 0; i < len; ++i) {
8570 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8571 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8572 if (up != ch) {
8573 if (up > maxchar)
8574 maxchar = up;
8575 PyUnicode_WRITE(kind, data, i, up);
8576 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008578 else if (ch > maxchar)
8579 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580 }
8581
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008582 if (touched)
8583 return maxchar;
8584 else
8585 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586}
8587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008588static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008589fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008591 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8592 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8593 const int kind = PyUnicode_KIND(self);
8594 void *data = PyUnicode_DATA(self);
8595 int touched = 0;
8596 Py_UCS4 maxchar = 0;
8597 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008598
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008599 for(i = 0; i < len; ++i) {
8600 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8601 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8602 if (lo != ch) {
8603 if (lo > maxchar)
8604 maxchar = lo;
8605 PyUnicode_WRITE(kind, data, i, lo);
8606 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008608 else if (ch > maxchar)
8609 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610 }
8611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008612 if (touched)
8613 return maxchar;
8614 else
8615 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616}
8617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008618static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008619fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008621 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8622 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8623 const int kind = PyUnicode_KIND(self);
8624 void *data = PyUnicode_DATA(self);
8625 int touched = 0;
8626 Py_UCS4 maxchar = 0;
8627 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008629 for(i = 0; i < len; ++i) {
8630 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8631 Py_UCS4 nu = 0;
8632
8633 if (Py_UNICODE_ISUPPER(ch))
8634 nu = Py_UNICODE_TOLOWER(ch);
8635 else if (Py_UNICODE_ISLOWER(ch))
8636 nu = Py_UNICODE_TOUPPER(ch);
8637
8638 if (nu != 0) {
8639 if (nu > maxchar)
8640 maxchar = nu;
8641 PyUnicode_WRITE(kind, data, i, nu);
8642 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644 else if (ch > maxchar)
8645 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646 }
8647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648 if (touched)
8649 return maxchar;
8650 else
8651 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652}
8653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008654static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008655fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008656{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8658 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8659 const int kind = PyUnicode_KIND(self);
8660 void *data = PyUnicode_DATA(self);
8661 int touched = 0;
8662 Py_UCS4 maxchar = 0;
8663 Py_ssize_t i = 0;
8664 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008665
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008666 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008668
8669 ch = PyUnicode_READ(kind, data, i);
8670 if (!Py_UNICODE_ISUPPER(ch)) {
8671 maxchar = Py_UNICODE_TOUPPER(ch);
8672 PyUnicode_WRITE(kind, data, i, maxchar);
8673 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008675 ++i;
8676 for(; i < len; ++i) {
8677 ch = PyUnicode_READ(kind, data, i);
8678 if (!Py_UNICODE_ISLOWER(ch)) {
8679 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8680 if (lo > maxchar)
8681 maxchar = lo;
8682 PyUnicode_WRITE(kind, data, i, lo);
8683 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008685 else if (ch > maxchar)
8686 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008687 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008688
8689 if (touched)
8690 return maxchar;
8691 else
8692 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008693}
8694
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008695static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008696fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008698 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8699 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8700 const int kind = PyUnicode_KIND(self);
8701 void *data = PyUnicode_DATA(self);
8702 Py_UCS4 maxchar = 0;
8703 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704 int previous_is_cased;
8705
8706 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707 if (len == 1) {
8708 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8709 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8710 if (ti != ch) {
8711 PyUnicode_WRITE(kind, data, i, ti);
8712 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 }
8714 else
8715 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008718 for(; i < len; ++i) {
8719 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8720 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008721
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008723 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008725 nu = Py_UNICODE_TOTITLE(ch);
8726
8727 if (nu > maxchar)
8728 maxchar = nu;
8729 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008730
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 if (Py_UNICODE_ISLOWER(ch) ||
8732 Py_UNICODE_ISUPPER(ch) ||
8733 Py_UNICODE_ISTITLE(ch))
8734 previous_is_cased = 1;
8735 else
8736 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008737 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008738 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739}
8740
Tim Peters8ce9f162004-08-27 01:49:32 +00008741PyObject *
8742PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008743{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008744 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008745 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008747 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008748 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8749 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008750 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008751 Py_ssize_t sz, i, res_offset;
8752 Py_UCS4 maxchar = 0;
8753 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754
Tim Peters05eba1f2004-08-27 21:32:02 +00008755 fseq = PySequence_Fast(seq, "");
8756 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008757 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008758 }
8759
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008760 /* NOTE: the following code can't call back into Python code,
8761 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008762 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008763
Tim Peters05eba1f2004-08-27 21:32:02 +00008764 seqlen = PySequence_Fast_GET_SIZE(fseq);
8765 /* If empty sequence, return u"". */
8766 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008767 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008768 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008769 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008770 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008771 /* If singleton sequence with an exact Unicode, return that. */
8772 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 item = items[0];
8774 if (PyUnicode_CheckExact(item)) {
8775 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008776 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 goto Done;
8778 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008779 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008780 else {
8781 /* Set up sep and seplen */
8782 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008783 /* fall back to a blank space separator */
8784 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008785 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008786 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008787 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008788 else {
8789 if (!PyUnicode_Check(separator)) {
8790 PyErr_Format(PyExc_TypeError,
8791 "separator: expected str instance,"
8792 " %.80s found",
8793 Py_TYPE(separator)->tp_name);
8794 goto onError;
8795 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008796 if (PyUnicode_READY(separator) == -1)
8797 goto onError;
8798 sep = separator;
8799 seplen = PyUnicode_GET_LENGTH(separator);
8800 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8801 /* inc refcount to keep this code path symetric with the
8802 above case of a blank separator */
8803 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008804 }
8805 }
8806
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008807 /* There are at least two things to join, or else we have a subclass
8808 * of str in the sequence.
8809 * Do a pre-pass to figure out the total amount of space we'll
8810 * need (sz), and see whether all argument are strings.
8811 */
8812 sz = 0;
8813 for (i = 0; i < seqlen; i++) {
8814 const Py_ssize_t old_sz = sz;
8815 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008816 if (!PyUnicode_Check(item)) {
8817 PyErr_Format(PyExc_TypeError,
8818 "sequence item %zd: expected str instance,"
8819 " %.80s found",
8820 i, Py_TYPE(item)->tp_name);
8821 goto onError;
8822 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008823 if (PyUnicode_READY(item) == -1)
8824 goto onError;
8825 sz += PyUnicode_GET_LENGTH(item);
8826 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8827 if (item_maxchar > maxchar)
8828 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008829 if (i != 0)
8830 sz += seplen;
8831 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8832 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008833 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008834 goto onError;
8835 }
8836 }
Tim Petersced69f82003-09-16 20:30:58 +00008837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008838 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008839 if (res == NULL)
8840 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008841
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008842 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008843 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008844 Py_ssize_t itemlen;
8845 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008846 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008847 /* Copy item, and maybe the separator. */
8848 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008849 if (PyUnicode_CopyCharacters(res, res_offset,
8850 sep, 0, seplen) < 0)
8851 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008852 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008853 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008854 if (PyUnicode_CopyCharacters(res, res_offset,
8855 item, 0, itemlen) < 0)
8856 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008857 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008858 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008859 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008860
Benjamin Peterson29060642009-01-31 22:14:21 +00008861 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008862 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008863 Py_XDECREF(sep);
8864 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865
Benjamin Peterson29060642009-01-31 22:14:21 +00008866 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008867 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008868 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008869 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870 return NULL;
8871}
8872
/* FILL(kind, data, value, start, length)

   Store the character `value` into `length` consecutive code units of the
   character buffer `data`, beginning at code unit index `start`.  `kind`
   selects the code unit width: the 1-byte kind uses memset, the 2-byte kind
   writes Py_UCS2 units and every other kind is written as Py_UCS4 units.
   `kind` must not be PyUnicode_WCHAR_KIND (checked with assert).

   Every argument is parenthesized in the expansion so that arbitrary
   expressions (e.g. a conditional expression for `value`) can be passed
   safely.  Note that `value` and `length` may be evaluated more than once. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
8895
Alexander Belopolsky40018472011-02-26 01:02:56 +00008896static PyUnicodeObject *
8897pad(PyUnicodeObject *self,
8898 Py_ssize_t left,
8899 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008900 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008902 PyObject *u;
8903 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008904 int kind;
8905 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008906
8907 if (left < 0)
8908 left = 0;
8909 if (right < 0)
8910 right = 0;
8911
Tim Peters7a29bd52001-09-12 03:03:31 +00008912 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008913 Py_INCREF(self);
8914 return self;
8915 }
8916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008917 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8918 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008919 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8920 return NULL;
8921 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008922 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8923 if (fill > maxchar)
8924 maxchar = fill;
8925 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008926 if (!u)
8927 return NULL;
8928
8929 kind = PyUnicode_KIND(u);
8930 data = PyUnicode_DATA(u);
8931 if (left)
8932 FILL(kind, data, fill, 0, left);
8933 if (right)
8934 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008935 if (PyUnicode_CopyCharacters(u, left,
8936 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008937 _PyUnicode_LENGTH(self)) < 0)
8938 {
8939 Py_DECREF(u);
8940 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008941 }
8942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008943 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008945#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946
Alexander Belopolsky40018472011-02-26 01:02:56 +00008947PyObject *
8948PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008950 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951
8952 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008953 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008954 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956 switch(PyUnicode_KIND(string)) {
8957 case PyUnicode_1BYTE_KIND:
8958 list = ucs1lib_splitlines(
8959 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8960 PyUnicode_GET_LENGTH(string), keepends);
8961 break;
8962 case PyUnicode_2BYTE_KIND:
8963 list = ucs2lib_splitlines(
8964 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8965 PyUnicode_GET_LENGTH(string), keepends);
8966 break;
8967 case PyUnicode_4BYTE_KIND:
8968 list = ucs4lib_splitlines(
8969 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8970 PyUnicode_GET_LENGTH(string), keepends);
8971 break;
8972 default:
8973 assert(0);
8974 list = 0;
8975 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976 Py_DECREF(string);
8977 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978}
8979
Alexander Belopolsky40018472011-02-26 01:02:56 +00008980static PyObject *
8981split(PyUnicodeObject *self,
8982 PyUnicodeObject *substring,
8983 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 int kind1, kind2, kind;
8986 void *buf1, *buf2;
8987 Py_ssize_t len1, len2;
8988 PyObject* out;
8989
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008991 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008993 if (PyUnicode_READY(self) == -1)
8994 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008996 if (substring == NULL)
8997 switch(PyUnicode_KIND(self)) {
8998 case PyUnicode_1BYTE_KIND:
8999 return ucs1lib_split_whitespace(
9000 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9001 PyUnicode_GET_LENGTH(self), maxcount
9002 );
9003 case PyUnicode_2BYTE_KIND:
9004 return ucs2lib_split_whitespace(
9005 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9006 PyUnicode_GET_LENGTH(self), maxcount
9007 );
9008 case PyUnicode_4BYTE_KIND:
9009 return ucs4lib_split_whitespace(
9010 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9011 PyUnicode_GET_LENGTH(self), maxcount
9012 );
9013 default:
9014 assert(0);
9015 return NULL;
9016 }
9017
9018 if (PyUnicode_READY(substring) == -1)
9019 return NULL;
9020
9021 kind1 = PyUnicode_KIND(self);
9022 kind2 = PyUnicode_KIND(substring);
9023 kind = kind1 > kind2 ? kind1 : kind2;
9024 buf1 = PyUnicode_DATA(self);
9025 buf2 = PyUnicode_DATA(substring);
9026 if (kind1 != kind)
9027 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9028 if (!buf1)
9029 return NULL;
9030 if (kind2 != kind)
9031 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9032 if (!buf2) {
9033 if (kind1 != kind) PyMem_Free(buf1);
9034 return NULL;
9035 }
9036 len1 = PyUnicode_GET_LENGTH(self);
9037 len2 = PyUnicode_GET_LENGTH(substring);
9038
9039 switch(kind) {
9040 case PyUnicode_1BYTE_KIND:
9041 out = ucs1lib_split(
9042 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9043 break;
9044 case PyUnicode_2BYTE_KIND:
9045 out = ucs2lib_split(
9046 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9047 break;
9048 case PyUnicode_4BYTE_KIND:
9049 out = ucs4lib_split(
9050 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9051 break;
9052 default:
9053 out = NULL;
9054 }
9055 if (kind1 != kind)
9056 PyMem_Free(buf1);
9057 if (kind2 != kind)
9058 PyMem_Free(buf2);
9059 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009060}
9061
Alexander Belopolsky40018472011-02-26 01:02:56 +00009062static PyObject *
9063rsplit(PyUnicodeObject *self,
9064 PyUnicodeObject *substring,
9065 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009066{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067 int kind1, kind2, kind;
9068 void *buf1, *buf2;
9069 Py_ssize_t len1, len2;
9070 PyObject* out;
9071
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009072 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009073 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009074
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009075 if (PyUnicode_READY(self) == -1)
9076 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009077
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078 if (substring == NULL)
9079 switch(PyUnicode_KIND(self)) {
9080 case PyUnicode_1BYTE_KIND:
9081 return ucs1lib_rsplit_whitespace(
9082 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9083 PyUnicode_GET_LENGTH(self), maxcount
9084 );
9085 case PyUnicode_2BYTE_KIND:
9086 return ucs2lib_rsplit_whitespace(
9087 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9088 PyUnicode_GET_LENGTH(self), maxcount
9089 );
9090 case PyUnicode_4BYTE_KIND:
9091 return ucs4lib_rsplit_whitespace(
9092 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9093 PyUnicode_GET_LENGTH(self), maxcount
9094 );
9095 default:
9096 assert(0);
9097 return NULL;
9098 }
9099
9100 if (PyUnicode_READY(substring) == -1)
9101 return NULL;
9102
9103 kind1 = PyUnicode_KIND(self);
9104 kind2 = PyUnicode_KIND(substring);
9105 kind = kind1 > kind2 ? kind1 : kind2;
9106 buf1 = PyUnicode_DATA(self);
9107 buf2 = PyUnicode_DATA(substring);
9108 if (kind1 != kind)
9109 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9110 if (!buf1)
9111 return NULL;
9112 if (kind2 != kind)
9113 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9114 if (!buf2) {
9115 if (kind1 != kind) PyMem_Free(buf1);
9116 return NULL;
9117 }
9118 len1 = PyUnicode_GET_LENGTH(self);
9119 len2 = PyUnicode_GET_LENGTH(substring);
9120
9121 switch(kind) {
9122 case PyUnicode_1BYTE_KIND:
9123 out = ucs1lib_rsplit(
9124 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9125 break;
9126 case PyUnicode_2BYTE_KIND:
9127 out = ucs2lib_rsplit(
9128 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9129 break;
9130 case PyUnicode_4BYTE_KIND:
9131 out = ucs4lib_rsplit(
9132 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9133 break;
9134 default:
9135 out = NULL;
9136 }
9137 if (kind1 != kind)
9138 PyMem_Free(buf1);
9139 if (kind2 != kind)
9140 PyMem_Free(buf2);
9141 return out;
9142}
9143
9144static Py_ssize_t
9145anylib_find(int kind, void *buf1, Py_ssize_t len1,
9146 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9147{
9148 switch(kind) {
9149 case PyUnicode_1BYTE_KIND:
9150 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9151 case PyUnicode_2BYTE_KIND:
9152 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9153 case PyUnicode_4BYTE_KIND:
9154 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9155 }
9156 assert(0);
9157 return -1;
9158}
9159
9160static Py_ssize_t
9161anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9162 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9163{
9164 switch(kind) {
9165 case PyUnicode_1BYTE_KIND:
9166 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9167 case PyUnicode_2BYTE_KIND:
9168 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9169 case PyUnicode_4BYTE_KIND:
9170 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9171 }
9172 assert(0);
9173 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009174}
9175
Alexander Belopolsky40018472011-02-26 01:02:56 +00009176static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009177replace(PyObject *self, PyObject *str1,
9178 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009179{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009180 PyObject *u;
9181 char *sbuf = PyUnicode_DATA(self);
9182 char *buf1 = PyUnicode_DATA(str1);
9183 char *buf2 = PyUnicode_DATA(str2);
9184 int srelease = 0, release1 = 0, release2 = 0;
9185 int skind = PyUnicode_KIND(self);
9186 int kind1 = PyUnicode_KIND(str1);
9187 int kind2 = PyUnicode_KIND(str2);
9188 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9189 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9190 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009191
9192 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009193 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009194 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009195 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009196
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009197 if (skind < kind1)
9198 /* substring too wide to be present */
9199 goto nothing;
9200
9201 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009202 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009203 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009204 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009205 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009206 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009207 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009208 Py_UCS4 u1, u2, maxchar;
9209 int mayshrink, rkind;
9210 u1 = PyUnicode_READ_CHAR(str1, 0);
9211 if (!findchar(sbuf, PyUnicode_KIND(self),
9212 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009213 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009214 u2 = PyUnicode_READ_CHAR(str2, 0);
9215 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9216 /* Replacing u1 with u2 may cause a maxchar reduction in the
9217 result string. */
9218 mayshrink = maxchar > 127;
9219 if (u2 > maxchar) {
9220 maxchar = u2;
9221 mayshrink = 0;
9222 }
9223 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009224 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009225 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009226 if (PyUnicode_CopyCharacters(u, 0,
9227 (PyObject*)self, 0, slen) < 0)
9228 {
9229 Py_DECREF(u);
9230 return NULL;
9231 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009232 rkind = PyUnicode_KIND(u);
9233 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9234 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009235 if (--maxcount < 0)
9236 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009238 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239 if (mayshrink) {
9240 PyObject *tmp = u;
9241 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9242 PyUnicode_GET_LENGTH(tmp));
9243 Py_DECREF(tmp);
9244 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009245 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009246 int rkind = skind;
9247 char *res;
9248 if (kind1 < rkind) {
9249 /* widen substring */
9250 buf1 = _PyUnicode_AsKind(str1, rkind);
9251 if (!buf1) goto error;
9252 release1 = 1;
9253 }
9254 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009255 if (i < 0)
9256 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009257 if (rkind > kind2) {
9258 /* widen replacement */
9259 buf2 = _PyUnicode_AsKind(str2, rkind);
9260 if (!buf2) goto error;
9261 release2 = 1;
9262 }
9263 else if (rkind < kind2) {
9264 /* widen self and buf1 */
9265 rkind = kind2;
9266 if (release1) PyMem_Free(buf1);
9267 sbuf = _PyUnicode_AsKind(self, rkind);
9268 if (!sbuf) goto error;
9269 srelease = 1;
9270 buf1 = _PyUnicode_AsKind(str1, rkind);
9271 if (!buf1) goto error;
9272 release1 = 1;
9273 }
9274 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9275 if (!res) {
9276 PyErr_NoMemory();
9277 goto error;
9278 }
9279 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009280 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9282 buf2,
9283 PyUnicode_KIND_SIZE(rkind, len2));
9284 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009285
9286 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009287 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9288 slen-i,
9289 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009290 if (i == -1)
9291 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009292 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9293 buf2,
9294 PyUnicode_KIND_SIZE(rkind, len2));
9295 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009296 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009297
9298 u = PyUnicode_FromKindAndData(rkind, res, slen);
9299 PyMem_Free(res);
9300 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009301 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009302 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009304 Py_ssize_t n, i, j, ires;
9305 Py_ssize_t product, new_size;
9306 int rkind = skind;
9307 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009309 if (kind1 < rkind) {
9310 buf1 = _PyUnicode_AsKind(str1, rkind);
9311 if (!buf1) goto error;
9312 release1 = 1;
9313 }
9314 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009315 if (n == 0)
9316 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317 if (kind2 < rkind) {
9318 buf2 = _PyUnicode_AsKind(str2, rkind);
9319 if (!buf2) goto error;
9320 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009321 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009322 else if (kind2 > rkind) {
9323 rkind = kind2;
9324 sbuf = _PyUnicode_AsKind(self, rkind);
9325 if (!sbuf) goto error;
9326 srelease = 1;
9327 if (release1) PyMem_Free(buf1);
9328 buf1 = _PyUnicode_AsKind(str1, rkind);
9329 if (!buf1) goto error;
9330 release1 = 1;
9331 }
9332 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9333 PyUnicode_GET_LENGTH(str1))); */
9334 product = n * (len2-len1);
9335 if ((product / (len2-len1)) != n) {
9336 PyErr_SetString(PyExc_OverflowError,
9337 "replace string is too long");
9338 goto error;
9339 }
9340 new_size = slen + product;
9341 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9342 PyErr_SetString(PyExc_OverflowError,
9343 "replace string is too long");
9344 goto error;
9345 }
9346 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9347 if (!res)
9348 goto error;
9349 ires = i = 0;
9350 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009351 while (n-- > 0) {
9352 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009353 j = anylib_find(rkind,
9354 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9355 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009356 if (j == -1)
9357 break;
9358 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009359 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9361 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9362 PyUnicode_KIND_SIZE(rkind, j-i));
9363 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009364 }
9365 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009366 if (len2 > 0) {
9367 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9368 buf2,
9369 PyUnicode_KIND_SIZE(rkind, len2));
9370 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009371 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009374 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009375 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009376 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9377 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9378 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009379 } else {
9380 /* interleave */
9381 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009382 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9383 buf2,
9384 PyUnicode_KIND_SIZE(rkind, len2));
9385 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009386 if (--n <= 0)
9387 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009388 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9389 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9390 PyUnicode_KIND_SIZE(rkind, 1));
9391 ires++;
9392 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009393 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9395 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9396 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009397 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009398 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009399 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009400 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 if (srelease)
9402 PyMem_FREE(sbuf);
9403 if (release1)
9404 PyMem_FREE(buf1);
9405 if (release2)
9406 PyMem_FREE(buf2);
9407 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009408
Benjamin Peterson29060642009-01-31 22:14:21 +00009409 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009410 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411 if (srelease)
9412 PyMem_FREE(sbuf);
9413 if (release1)
9414 PyMem_FREE(buf1);
9415 if (release2)
9416 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009417 if (PyUnicode_CheckExact(self)) {
9418 Py_INCREF(self);
9419 return (PyObject *) self;
9420 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009421 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009422 error:
9423 if (srelease && sbuf)
9424 PyMem_FREE(sbuf);
9425 if (release1 && buf1)
9426 PyMem_FREE(buf1);
9427 if (release2 && buf2)
9428 PyMem_FREE(buf2);
9429 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009430}
9431
9432/* --- Unicode Object Methods --------------------------------------------- */
9433
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009434PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009435 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009436\n\
9437Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009438characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009439
9440static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009441unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009442{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443 return fixup(self, fixtitle);
9444}
9445
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009446PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009447 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448\n\
9449Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009450have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009451
9452static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009453unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009454{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009455 return fixup(self, fixcapitalize);
9456}
9457
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code: split S on whitespace, capitalize every word in place in
   the resulting list, and join the words again with PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t pos = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    while (pos < PyList_GET_SIZE(words)) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                       fixcapitalize);
        if (!result)
            goto onError;
        /* Swap the capitalized word in for the original list item. */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, result);
        pos++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  onError:
    /* result is NULL here when capitalizing a word failed. */
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
9495
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009496/* Argument converter. Coerces to a single unicode character */
9497
9498static int
9499convert_uc(PyObject *obj, void *addr)
9500{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009502 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009503
Benjamin Peterson14339b62009-01-31 16:36:08 +00009504 uniobj = PyUnicode_FromObject(obj);
9505 if (uniobj == NULL) {
9506 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009507 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009508 return 0;
9509 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009511 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009512 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009513 Py_DECREF(uniobj);
9514 return 0;
9515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009517 Py_DECREF(uniobj);
9518 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009519}
9520
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009521PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009522 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009523\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009524Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009525done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009526
9527static PyObject *
9528unicode_center(PyUnicodeObject *self, PyObject *args)
9529{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009530 Py_ssize_t marg, left;
9531 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 Py_UCS4 fillchar = ' ';
9533
Victor Stinnere9a29352011-10-01 02:14:59 +02009534 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009535 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009536
Victor Stinnere9a29352011-10-01 02:14:59 +02009537 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009538 return NULL;
9539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009541 Py_INCREF(self);
9542 return (PyObject*) self;
9543 }
9544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009546 left = marg / 2 + (marg & width & 1);
9547
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009548 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009549}
9550
Marc-André Lemburge5034372000-08-08 08:04:29 +00009551#if 0
9552
9553/* This code should go into some future Unicode collation support
9554 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009555 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009556
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009557/* speedy UTF-16 code point order comparison */
9558/* gleaned from: */
9559/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9560
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009561static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009562{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009563 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009564 0, 0, 0, 0, 0, 0, 0, 0,
9565 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009566 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009567};
9568
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569static int
9570unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9571{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009572 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009573
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574 Py_UNICODE *s1 = str1->str;
9575 Py_UNICODE *s2 = str2->str;
9576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 len1 = str1->_base._base.length;
9578 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009579
Guido van Rossumd57fd912000-03-10 22:53:23 +00009580 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009581 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009582
9583 c1 = *s1++;
9584 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009585
Benjamin Peterson29060642009-01-31 22:14:21 +00009586 if (c1 > (1<<11) * 26)
9587 c1 += utf16Fixup[c1>>11];
9588 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009589 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009590 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009591
9592 if (c1 != c2)
9593 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009594
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009595 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009596 }
9597
9598 return (len1 < len2) ? -1 : (len1 != len2);
9599}
9600
Marc-André Lemburge5034372000-08-08 08:04:29 +00009601#else
9602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603/* This function assumes that str1 and str2 are readied by the caller. */
9604
Marc-André Lemburge5034372000-08-08 08:04:29 +00009605static int
9606unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9607{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009608 int kind1, kind2;
9609 void *data1, *data2;
9610 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 kind1 = PyUnicode_KIND(str1);
9613 kind2 = PyUnicode_KIND(str2);
9614 data1 = PyUnicode_DATA(str1);
9615 data2 = PyUnicode_DATA(str2);
9616 len1 = PyUnicode_GET_LENGTH(str1);
9617 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009619 for (i = 0; i < len1 && i < len2; ++i) {
9620 Py_UCS4 c1, c2;
9621 c1 = PyUnicode_READ(kind1, data1, i);
9622 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009623
9624 if (c1 != c2)
9625 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009626 }
9627
9628 return (len1 < len2) ? -1 : (len1 != len2);
9629}
9630
9631#endif
9632
Alexander Belopolsky40018472011-02-26 01:02:56 +00009633int
9634PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009636 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9637 if (PyUnicode_READY(left) == -1 ||
9638 PyUnicode_READY(right) == -1)
9639 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009640 return unicode_compare((PyUnicodeObject *)left,
9641 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009642 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009643 PyErr_Format(PyExc_TypeError,
9644 "Can't compare %.100s and %.100s",
9645 left->ob_type->tp_name,
9646 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009647 return -1;
9648}
9649
Martin v. Löwis5b222132007-06-10 09:51:05 +00009650int
9651PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9652{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009653 Py_ssize_t i;
9654 int kind;
9655 void *data;
9656 Py_UCS4 chr;
9657
Victor Stinner910337b2011-10-03 03:20:16 +02009658 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009659 if (PyUnicode_READY(uni) == -1)
9660 return -1;
9661 kind = PyUnicode_KIND(uni);
9662 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009663 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009664 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9665 if (chr != str[i])
9666 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009667 /* This check keeps Python strings that end in '\0' from comparing equal
9668 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009669 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009670 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009671 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009672 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009673 return 0;
9674}
9675
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009676
Benjamin Peterson29060642009-01-31 22:14:21 +00009677#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009678 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009679
Alexander Belopolsky40018472011-02-26 01:02:56 +00009680PyObject *
9681PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009682{
9683 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009684
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009685 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9686 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009687 if (PyUnicode_READY(left) == -1 ||
9688 PyUnicode_READY(right) == -1)
9689 return NULL;
9690 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9691 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009692 if (op == Py_EQ) {
9693 Py_INCREF(Py_False);
9694 return Py_False;
9695 }
9696 if (op == Py_NE) {
9697 Py_INCREF(Py_True);
9698 return Py_True;
9699 }
9700 }
9701 if (left == right)
9702 result = 0;
9703 else
9704 result = unicode_compare((PyUnicodeObject *)left,
9705 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009706
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009707 /* Convert the return value to a Boolean */
9708 switch (op) {
9709 case Py_EQ:
9710 v = TEST_COND(result == 0);
9711 break;
9712 case Py_NE:
9713 v = TEST_COND(result != 0);
9714 break;
9715 case Py_LE:
9716 v = TEST_COND(result <= 0);
9717 break;
9718 case Py_GE:
9719 v = TEST_COND(result >= 0);
9720 break;
9721 case Py_LT:
9722 v = TEST_COND(result == -1);
9723 break;
9724 case Py_GT:
9725 v = TEST_COND(result == 1);
9726 break;
9727 default:
9728 PyErr_BadArgument();
9729 return NULL;
9730 }
9731 Py_INCREF(v);
9732 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009733 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009734
Brian Curtindfc80e32011-08-10 20:28:54 -05009735 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009736}
9737
Alexander Belopolsky40018472011-02-26 01:02:56 +00009738int
9739PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009740{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009741 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009742 int kind1, kind2, kind;
9743 void *buf1, *buf2;
9744 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009745 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009746
9747 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009748 sub = PyUnicode_FromObject(element);
9749 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009750 PyErr_Format(PyExc_TypeError,
9751 "'in <string>' requires string as left operand, not %s",
9752 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009753 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009754 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009755 if (PyUnicode_READY(sub) == -1)
9756 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009757
Thomas Wouters477c8d52006-05-27 19:21:47 +00009758 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009759 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009760 Py_DECREF(sub);
9761 return -1;
9762 }
9763
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009764 kind1 = PyUnicode_KIND(str);
9765 kind2 = PyUnicode_KIND(sub);
9766 kind = kind1 > kind2 ? kind1 : kind2;
9767 buf1 = PyUnicode_DATA(str);
9768 buf2 = PyUnicode_DATA(sub);
9769 if (kind1 != kind)
9770 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9771 if (!buf1) {
9772 Py_DECREF(sub);
9773 return -1;
9774 }
9775 if (kind2 != kind)
9776 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9777 if (!buf2) {
9778 Py_DECREF(sub);
9779 if (kind1 != kind) PyMem_Free(buf1);
9780 return -1;
9781 }
9782 len1 = PyUnicode_GET_LENGTH(str);
9783 len2 = PyUnicode_GET_LENGTH(sub);
9784
9785 switch(kind) {
9786 case PyUnicode_1BYTE_KIND:
9787 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9788 break;
9789 case PyUnicode_2BYTE_KIND:
9790 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9791 break;
9792 case PyUnicode_4BYTE_KIND:
9793 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9794 break;
9795 default:
9796 result = -1;
9797 assert(0);
9798 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009799
9800 Py_DECREF(str);
9801 Py_DECREF(sub);
9802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009803 if (kind1 != kind)
9804 PyMem_Free(buf1);
9805 if (kind2 != kind)
9806 PyMem_Free(buf2);
9807
Guido van Rossum403d68b2000-03-13 15:55:09 +00009808 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009809}
9810
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811/* Concat to string or Unicode object giving a new Unicode object. */
9812
Alexander Belopolsky40018472011-02-26 01:02:56 +00009813PyObject *
9814PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009815{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816 PyObject *u = NULL, *v = NULL, *w;
9817 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009818
9819 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009820 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009821 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009822 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009823 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009824 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009825 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009826
9827 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009828 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009829 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009831 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009832 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009833 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009834 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009835 }
9836
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009837 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009838 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839
Guido van Rossumd57fd912000-03-10 22:53:23 +00009840 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009841 w = PyUnicode_New(
9842 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9843 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009844 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009845 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009846 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9847 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009848 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009849 v, 0,
9850 PyUnicode_GET_LENGTH(v)) < 0)
9851 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009852 Py_DECREF(u);
9853 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009854 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009855
Benjamin Peterson29060642009-01-31 22:14:21 +00009856 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009857 Py_XDECREF(u);
9858 Py_XDECREF(v);
9859 return NULL;
9860}
9861
Walter Dörwald1ab83302007-05-18 17:15:44 +00009862void
Victor Stinner23e56682011-10-03 03:54:37 +02009863PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +00009864{
Victor Stinner23e56682011-10-03 03:54:37 +02009865 PyObject *left, *res;
9866
9867 if (p_left == NULL) {
9868 if (!PyErr_Occurred())
9869 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +00009870 return;
9871 }
Victor Stinner23e56682011-10-03 03:54:37 +02009872 left = *p_left;
9873 if (right == NULL || !PyUnicode_Check(left)) {
9874 if (!PyErr_Occurred())
9875 PyErr_BadInternalCall();
9876 goto error;
9877 }
9878
9879 if (PyUnicode_CheckExact(left) && left != unicode_empty
9880 && PyUnicode_CheckExact(right) && right != unicode_empty
9881 && unicode_resizable(left)
9882 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
9883 || _PyUnicode_WSTR(left) != NULL))
9884 {
9885 Py_ssize_t u_len, v_len, new_len, copied;
9886
9887 /* FIXME: don't make wstr string ready */
9888 if (PyUnicode_READY(left))
9889 goto error;
9890 if (PyUnicode_READY(right))
9891 goto error;
9892
9893 /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */
9894 if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left))
9895 {
9896 u_len = PyUnicode_GET_LENGTH(left);
9897 v_len = PyUnicode_GET_LENGTH(right);
9898 if (u_len > PY_SSIZE_T_MAX - v_len) {
9899 PyErr_SetString(PyExc_OverflowError,
9900 "strings are too large to concat");
9901 goto error;
9902 }
9903 new_len = u_len + v_len;
9904
9905 /* Now we own the last reference to 'left', so we can resize it
9906 * in-place.
9907 */
9908 if (unicode_resize(&left, new_len) != 0) {
9909 /* XXX if _PyUnicode_Resize() fails, 'left' has been
9910 * deallocated so it cannot be put back into
9911 * 'variable'. The MemoryError is raised when there
9912 * is no value in 'variable', which might (very
9913 * remotely) be a cause of incompatibilities.
9914 */
9915 goto error;
9916 }
9917 /* copy 'right' into the newly allocated area of 'left' */
9918 copied = PyUnicode_CopyCharacters(left, u_len,
9919 right, 0,
9920 v_len);
9921 assert(0 <= copied);
9922 *p_left = left;
9923 return;
9924 }
9925 }
9926
9927 res = PyUnicode_Concat(left, right);
9928 if (res == NULL)
9929 goto error;
9930 Py_DECREF(left);
9931 *p_left = res;
9932 return;
9933
9934error:
9935 Py_DECREF(*p_left);
9936 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009937}
9938
9939void
9940PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9941{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009942 PyUnicode_Append(pleft, right);
9943 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009944}
9945
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009946PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009947 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009948\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009949Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009950string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009951interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952
9953static PyObject *
9954unicode_count(PyUnicodeObject *self, PyObject *args)
9955{
9956 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009957 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009958 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009959 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009960 int kind1, kind2, kind;
9961 void *buf1, *buf2;
9962 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009963
Jesus Ceaac451502011-04-20 17:09:23 +02009964 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9965 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009966 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009967
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 kind1 = PyUnicode_KIND(self);
9969 kind2 = PyUnicode_KIND(substring);
9970 kind = kind1 > kind2 ? kind1 : kind2;
9971 buf1 = PyUnicode_DATA(self);
9972 buf2 = PyUnicode_DATA(substring);
9973 if (kind1 != kind)
9974 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9975 if (!buf1) {
9976 Py_DECREF(substring);
9977 return NULL;
9978 }
9979 if (kind2 != kind)
9980 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9981 if (!buf2) {
9982 Py_DECREF(substring);
9983 if (kind1 != kind) PyMem_Free(buf1);
9984 return NULL;
9985 }
9986 len1 = PyUnicode_GET_LENGTH(self);
9987 len2 = PyUnicode_GET_LENGTH(substring);
9988
9989 ADJUST_INDICES(start, end, len1);
9990 switch(kind) {
9991 case PyUnicode_1BYTE_KIND:
9992 iresult = ucs1lib_count(
9993 ((Py_UCS1*)buf1) + start, end - start,
9994 buf2, len2, PY_SSIZE_T_MAX
9995 );
9996 break;
9997 case PyUnicode_2BYTE_KIND:
9998 iresult = ucs2lib_count(
9999 ((Py_UCS2*)buf1) + start, end - start,
10000 buf2, len2, PY_SSIZE_T_MAX
10001 );
10002 break;
10003 case PyUnicode_4BYTE_KIND:
10004 iresult = ucs4lib_count(
10005 ((Py_UCS4*)buf1) + start, end - start,
10006 buf2, len2, PY_SSIZE_T_MAX
10007 );
10008 break;
10009 default:
10010 assert(0); iresult = 0;
10011 }
10012
10013 result = PyLong_FromSsize_t(iresult);
10014
10015 if (kind1 != kind)
10016 PyMem_Free(buf1);
10017 if (kind2 != kind)
10018 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019
10020 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010021
Guido van Rossumd57fd912000-03-10 22:53:23 +000010022 return result;
10023}
10024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010025PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010026 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010027\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010028Encode S using the codec registered for encoding. Default encoding\n\
10029is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010030handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010031a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10032'xmlcharrefreplace' as well as any other name registered with\n\
10033codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010034
10035static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010036unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010037{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010038 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010039 char *encoding = NULL;
10040 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010041
Benjamin Peterson308d6372009-09-18 21:42:35 +000010042 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10043 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010044 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010045 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010046}
10047
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010048PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010049 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010050\n\
10051Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010052If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010053
10054static PyObject*
10055unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10056{
10057 Py_UNICODE *e;
10058 Py_UNICODE *p;
10059 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010060 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010062 PyUnicodeObject *u;
10063 int tabsize = 8;
10064
10065 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010066 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010068 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
10069 return NULL;
10070
Thomas Wouters7e474022000-07-16 12:04:32 +000010071 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010072 i = 0; /* chars up to and including most recent \n or \r */
10073 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
10075 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010076 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010077 if (tabsize > 0) {
10078 incr = tabsize - (j % tabsize); /* cannot overflow */
10079 if (j > PY_SSIZE_T_MAX - incr)
10080 goto overflow1;
10081 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010082 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010083 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010084 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010085 if (j > PY_SSIZE_T_MAX - 1)
10086 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010087 j++;
10088 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010089 if (i > PY_SSIZE_T_MAX - j)
10090 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010091 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010092 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010093 }
10094 }
10095
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010096 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +000010097 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010098
Guido van Rossumd57fd912000-03-10 22:53:23 +000010099 /* Second pass: create output string and fill it */
10100 u = _PyUnicode_New(i + j);
10101 if (!u)
10102 return NULL;
10103
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010104 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010105 q = _PyUnicode_WSTR(u); /* next output char */
10106 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010107
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010109 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010110 if (tabsize > 0) {
10111 i = tabsize - (j % tabsize);
10112 j += i;
10113 while (i--) {
10114 if (q >= qe)
10115 goto overflow2;
10116 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010117 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010118 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010119 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010120 else {
10121 if (q >= qe)
10122 goto overflow2;
10123 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010124 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010125 if (*p == '\n' || *p == '\r')
10126 j = 0;
10127 }
10128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010129 if (PyUnicode_READY(u) == -1) {
10130 Py_DECREF(u);
10131 return NULL;
10132 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010133 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010134
10135 overflow2:
10136 Py_DECREF(u);
10137 overflow1:
10138 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10139 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140}
10141
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010142PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010143 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010144\n\
10145Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010146such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010147arguments start and end are interpreted as in slice notation.\n\
10148\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010149Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150
10151static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010153{
Jesus Ceaac451502011-04-20 17:09:23 +020010154 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010155 Py_ssize_t start;
10156 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010157 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010158
Jesus Ceaac451502011-04-20 17:09:23 +020010159 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10160 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 if (PyUnicode_READY(self) == -1)
10164 return NULL;
10165 if (PyUnicode_READY(substring) == -1)
10166 return NULL;
10167
10168 result = any_find_slice(
10169 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10170 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010171 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172
10173 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 if (result == -2)
10176 return NULL;
10177
Christian Heimes217cfd12007-12-02 14:31:20 +000010178 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010179}
10180
10181static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010182unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010183{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010184 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10185 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010188}
10189
Guido van Rossumc2504932007-09-18 19:42:40 +000010190/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010191 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010192static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010193unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010194{
Guido van Rossumc2504932007-09-18 19:42:40 +000010195 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010196 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010197
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 if (_PyUnicode_HASH(self) != -1)
10199 return _PyUnicode_HASH(self);
10200 if (PyUnicode_READY(self) == -1)
10201 return -1;
10202 len = PyUnicode_GET_LENGTH(self);
10203
10204 /* The hash function as a macro, gets expanded three times below. */
10205#define HASH(P) \
10206 x = (Py_uhash_t)*P << 7; \
10207 while (--len >= 0) \
10208 x = (1000003*x) ^ (Py_uhash_t)*P++;
10209
10210 switch (PyUnicode_KIND(self)) {
10211 case PyUnicode_1BYTE_KIND: {
10212 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10213 HASH(c);
10214 break;
10215 }
10216 case PyUnicode_2BYTE_KIND: {
10217 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10218 HASH(s);
10219 break;
10220 }
10221 default: {
10222 Py_UCS4 *l;
10223 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10224 "Impossible switch case in unicode_hash");
10225 l = PyUnicode_4BYTE_DATA(self);
10226 HASH(l);
10227 break;
10228 }
10229 }
10230 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10231
Guido van Rossumc2504932007-09-18 19:42:40 +000010232 if (x == -1)
10233 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010235 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010238
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010239PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010240 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010242Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010243
10244static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010247 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010248 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010249 Py_ssize_t start;
10250 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010251
Jesus Ceaac451502011-04-20 17:09:23 +020010252 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10253 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010254 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 if (PyUnicode_READY(self) == -1)
10257 return NULL;
10258 if (PyUnicode_READY(substring) == -1)
10259 return NULL;
10260
10261 result = any_find_slice(
10262 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10263 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010264 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010265
10266 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 if (result == -2)
10269 return NULL;
10270
Guido van Rossumd57fd912000-03-10 22:53:23 +000010271 if (result < 0) {
10272 PyErr_SetString(PyExc_ValueError, "substring not found");
10273 return NULL;
10274 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010275
Christian Heimes217cfd12007-12-02 14:31:20 +000010276 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010277}
10278
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010279PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010280 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010281\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010282Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010283at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010284
10285static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010286unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010287{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 Py_ssize_t i, length;
10289 int kind;
10290 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010291 int cased;
10292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 if (PyUnicode_READY(self) == -1)
10294 return NULL;
10295 length = PyUnicode_GET_LENGTH(self);
10296 kind = PyUnicode_KIND(self);
10297 data = PyUnicode_DATA(self);
10298
Guido van Rossumd57fd912000-03-10 22:53:23 +000010299 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 if (length == 1)
10301 return PyBool_FromLong(
10302 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010303
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010304 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010306 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010307
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 for (i = 0; i < length; i++) {
10310 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010311
Benjamin Peterson29060642009-01-31 22:14:21 +000010312 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10313 return PyBool_FromLong(0);
10314 else if (!cased && Py_UNICODE_ISLOWER(ch))
10315 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010317 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010318}
10319
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010320PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010321 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010322\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010323Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010324at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010325
10326static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010327unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010328{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 Py_ssize_t i, length;
10330 int kind;
10331 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010332 int cased;
10333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 if (PyUnicode_READY(self) == -1)
10335 return NULL;
10336 length = PyUnicode_GET_LENGTH(self);
10337 kind = PyUnicode_KIND(self);
10338 data = PyUnicode_DATA(self);
10339
Guido van Rossumd57fd912000-03-10 22:53:23 +000010340 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 if (length == 1)
10342 return PyBool_FromLong(
10343 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010344
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010345 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010347 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010348
Guido van Rossumd57fd912000-03-10 22:53:23 +000010349 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 for (i = 0; i < length; i++) {
10351 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010352
Benjamin Peterson29060642009-01-31 22:14:21 +000010353 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10354 return PyBool_FromLong(0);
10355 else if (!cased && Py_UNICODE_ISUPPER(ch))
10356 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010357 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010358 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359}
10360
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010361PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010362 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010363\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010364Return True if S is a titlecased string and there is at least one\n\
10365character in S, i.e. upper- and titlecase characters may only\n\
10366follow uncased characters and lowercase characters only cased ones.\n\
10367Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010368
10369static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010370unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372 Py_ssize_t i, length;
10373 int kind;
10374 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375 int cased, previous_is_cased;
10376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 if (PyUnicode_READY(self) == -1)
10378 return NULL;
10379 length = PyUnicode_GET_LENGTH(self);
10380 kind = PyUnicode_KIND(self);
10381 data = PyUnicode_DATA(self);
10382
Guido van Rossumd57fd912000-03-10 22:53:23 +000010383 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 if (length == 1) {
10385 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10386 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10387 (Py_UNICODE_ISUPPER(ch) != 0));
10388 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010389
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010390 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010391 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010392 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010393
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394 cased = 0;
10395 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 for (i = 0; i < length; i++) {
10397 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010398
Benjamin Peterson29060642009-01-31 22:14:21 +000010399 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10400 if (previous_is_cased)
10401 return PyBool_FromLong(0);
10402 previous_is_cased = 1;
10403 cased = 1;
10404 }
10405 else if (Py_UNICODE_ISLOWER(ch)) {
10406 if (!previous_is_cased)
10407 return PyBool_FromLong(0);
10408 previous_is_cased = 1;
10409 cased = 1;
10410 }
10411 else
10412 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010413 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010414 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010415}
10416
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010417PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010418 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010419\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010420Return True if all characters in S are whitespace\n\
10421and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010422
10423static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010424unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010425{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 Py_ssize_t i, length;
10427 int kind;
10428 void *data;
10429
10430 if (PyUnicode_READY(self) == -1)
10431 return NULL;
10432 length = PyUnicode_GET_LENGTH(self);
10433 kind = PyUnicode_KIND(self);
10434 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010435
Guido van Rossumd57fd912000-03-10 22:53:23 +000010436 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 if (length == 1)
10438 return PyBool_FromLong(
10439 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010440
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010441 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010443 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 for (i = 0; i < length; i++) {
10446 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010447 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010448 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010449 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010450 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010451}
10452
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010453PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010454 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010455\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010456Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010457and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010458
10459static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010460unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010461{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 Py_ssize_t i, length;
10463 int kind;
10464 void *data;
10465
10466 if (PyUnicode_READY(self) == -1)
10467 return NULL;
10468 length = PyUnicode_GET_LENGTH(self);
10469 kind = PyUnicode_KIND(self);
10470 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010471
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010472 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 if (length == 1)
10474 return PyBool_FromLong(
10475 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010476
10477 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010479 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010480
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481 for (i = 0; i < length; i++) {
10482 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010483 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010484 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010485 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010486}
10487
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010488PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010489 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010490\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010491Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010492and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010493
10494static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010495unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010496{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 int kind;
10498 void *data;
10499 Py_ssize_t len, i;
10500
10501 if (PyUnicode_READY(self) == -1)
10502 return NULL;
10503
10504 kind = PyUnicode_KIND(self);
10505 data = PyUnicode_DATA(self);
10506 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010507
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010508 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 if (len == 1) {
10510 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10511 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10512 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010513
10514 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010516 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 for (i = 0; i < len; i++) {
10519 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010520 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010521 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010522 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010523 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010524}
10525
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010526PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010527 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010529Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010530False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531
10532static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010533unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010534{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010535 Py_ssize_t i, length;
10536 int kind;
10537 void *data;
10538
10539 if (PyUnicode_READY(self) == -1)
10540 return NULL;
10541 length = PyUnicode_GET_LENGTH(self);
10542 kind = PyUnicode_KIND(self);
10543 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010544
Guido van Rossumd57fd912000-03-10 22:53:23 +000010545 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 if (length == 1)
10547 return PyBool_FromLong(
10548 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010549
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010550 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010552 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 for (i = 0; i < length; i++) {
10555 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010556 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010557 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010558 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010559}
10560
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010561PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010562 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010563\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010564Return True if all characters in S are digits\n\
10565and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010566
10567static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010568unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010569{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 Py_ssize_t i, length;
10571 int kind;
10572 void *data;
10573
10574 if (PyUnicode_READY(self) == -1)
10575 return NULL;
10576 length = PyUnicode_GET_LENGTH(self);
10577 kind = PyUnicode_KIND(self);
10578 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010579
Guido van Rossumd57fd912000-03-10 22:53:23 +000010580 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 if (length == 1) {
10582 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10583 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10584 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010585
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010586 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010588 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010589
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 for (i = 0; i < length; i++) {
10591 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010592 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010594 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595}
10596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010597PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010598 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010600Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010601False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010602
10603static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010604unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010605{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 Py_ssize_t i, length;
10607 int kind;
10608 void *data;
10609
10610 if (PyUnicode_READY(self) == -1)
10611 return NULL;
10612 length = PyUnicode_GET_LENGTH(self);
10613 kind = PyUnicode_KIND(self);
10614 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615
Guido van Rossumd57fd912000-03-10 22:53:23 +000010616 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 if (length == 1)
10618 return PyBool_FromLong(
10619 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010620
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010621 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010623 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 for (i = 0; i < length; i++) {
10626 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010627 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010629 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010630}
10631
Martin v. Löwis47383402007-08-15 07:32:56 +000010632int
10633PyUnicode_IsIdentifier(PyObject *self)
10634{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 int kind;
10636 void *data;
10637 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010638 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 if (PyUnicode_READY(self) == -1) {
10641 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010642 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643 }
10644
10645 /* Special case for empty strings */
10646 if (PyUnicode_GET_LENGTH(self) == 0)
10647 return 0;
10648 kind = PyUnicode_KIND(self);
10649 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010650
10651 /* PEP 3131 says that the first character must be in
10652 XID_Start and subsequent characters in XID_Continue,
10653 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010654 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010655 letters, digits, underscore). However, given the current
10656 definition of XID_Start and XID_Continue, it is sufficient
10657 to check just for these, except that _ must be allowed
10658 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010660 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010661 return 0;
10662
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010663 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010665 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010666 return 1;
10667}
10668
10669PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010670 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010671\n\
10672Return True if S is a valid identifier according\n\
10673to the language definition.");
10674
10675static PyObject*
10676unicode_isidentifier(PyObject *self)
10677{
10678 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10679}
10680
Georg Brandl559e5d72008-06-11 18:37:52 +000010681PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010682 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010683\n\
10684Return True if all characters in S are considered\n\
10685printable in repr() or S is empty, False otherwise.");
10686
10687static PyObject*
10688unicode_isprintable(PyObject *self)
10689{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 Py_ssize_t i, length;
10691 int kind;
10692 void *data;
10693
10694 if (PyUnicode_READY(self) == -1)
10695 return NULL;
10696 length = PyUnicode_GET_LENGTH(self);
10697 kind = PyUnicode_KIND(self);
10698 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010699
10700 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010701 if (length == 1)
10702 return PyBool_FromLong(
10703 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010704
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010705 for (i = 0; i < length; i++) {
10706 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010707 Py_RETURN_FALSE;
10708 }
10709 }
10710 Py_RETURN_TRUE;
10711}
10712
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010713PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010714 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715\n\
10716Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010717iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010718
10719static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010720unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010721{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010722 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723}
10724
Martin v. Löwis18e16552006-02-15 17:27:45 +000010725static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726unicode_length(PyUnicodeObject *self)
10727{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 if (PyUnicode_READY(self) == -1)
10729 return -1;
10730 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731}
10732
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010733PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010734 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010735\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010736Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010737done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010738
10739static PyObject *
10740unicode_ljust(PyUnicodeObject *self, PyObject *args)
10741{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010742 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 Py_UCS4 fillchar = ' ';
10744
10745 if (PyUnicode_READY(self) == -1)
10746 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010747
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010748 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749 return NULL;
10750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010751 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752 Py_INCREF(self);
10753 return (PyObject*) self;
10754 }
10755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010756 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757}
10758
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010759PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010760 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010761\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010762Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763
10764static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010765unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010767 return fixup(self, fixlower);
10768}
10769
/* Which end(s) of the string a strip operation works on. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip types above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip type i: the format string without its leading
   three characters ("|O:"). */
#define STRIPNAME(i) (stripformat[i]+3)
10778
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010779/* externally visible for str.strip(unicode) */
10780PyObject *
10781_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10782{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 void *data;
10784 int kind;
10785 Py_ssize_t i, j, len;
10786 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10789 return NULL;
10790
10791 kind = PyUnicode_KIND(self);
10792 data = PyUnicode_DATA(self);
10793 len = PyUnicode_GET_LENGTH(self);
10794 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10795 PyUnicode_DATA(sepobj),
10796 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010797
Benjamin Peterson14339b62009-01-31 16:36:08 +000010798 i = 0;
10799 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010800 while (i < len &&
10801 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010802 i++;
10803 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010804 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010805
Benjamin Peterson14339b62009-01-31 16:36:08 +000010806 j = len;
10807 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010808 do {
10809 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 } while (j >= i &&
10811 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010812 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010813 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010814
Victor Stinner12bab6d2011-10-01 01:53:49 +020010815 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816}
10817
10818PyObject*
10819PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10820{
10821 unsigned char *data;
10822 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010823 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010824
Victor Stinnerde636f32011-10-01 03:55:54 +020010825 if (PyUnicode_READY(self) == -1)
10826 return NULL;
10827
10828 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10829
Victor Stinner12bab6d2011-10-01 01:53:49 +020010830 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010831 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010832 if (PyUnicode_CheckExact(self)) {
10833 Py_INCREF(self);
10834 return self;
10835 }
10836 else
10837 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010838 }
10839
Victor Stinner12bab6d2011-10-01 01:53:49 +020010840 length = end - start;
10841 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010842 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843
Victor Stinnerde636f32011-10-01 03:55:54 +020010844 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010845 PyErr_SetString(PyExc_IndexError, "string index out of range");
10846 return NULL;
10847 }
10848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010849 kind = PyUnicode_KIND(self);
10850 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010851 return PyUnicode_FromKindAndData(kind,
10852 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010853 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010854}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855
10856static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010857do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010858{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859 int kind;
10860 void *data;
10861 Py_ssize_t len, i, j;
10862
10863 if (PyUnicode_READY(self) == -1)
10864 return NULL;
10865
10866 kind = PyUnicode_KIND(self);
10867 data = PyUnicode_DATA(self);
10868 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010869
Benjamin Peterson14339b62009-01-31 16:36:08 +000010870 i = 0;
10871 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010872 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010873 i++;
10874 }
10875 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010876
Benjamin Peterson14339b62009-01-31 16:36:08 +000010877 j = len;
10878 if (striptype != LEFTSTRIP) {
10879 do {
10880 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010882 j++;
10883 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010884
Victor Stinner12bab6d2011-10-01 01:53:49 +020010885 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010886}
10887
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010888
10889static PyObject *
10890do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10891{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010892 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010893
Benjamin Peterson14339b62009-01-31 16:36:08 +000010894 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10895 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010896
Benjamin Peterson14339b62009-01-31 16:36:08 +000010897 if (sep != NULL && sep != Py_None) {
10898 if (PyUnicode_Check(sep))
10899 return _PyUnicode_XStrip(self, striptype, sep);
10900 else {
10901 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010902 "%s arg must be None or str",
10903 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010904 return NULL;
10905 }
10906 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010907
Benjamin Peterson14339b62009-01-31 16:36:08 +000010908 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010909}
10910
10911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010912PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010913 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010914\n\
10915Return a copy of the string S with leading and trailing\n\
10916whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010917If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010918
10919static PyObject *
10920unicode_strip(PyUnicodeObject *self, PyObject *args)
10921{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010922 if (PyTuple_GET_SIZE(args) == 0)
10923 return do_strip(self, BOTHSTRIP); /* Common case */
10924 else
10925 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010926}
10927
10928
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010929PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010930 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010931\n\
10932Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010933If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010934
10935static PyObject *
10936unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10937{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010938 if (PyTuple_GET_SIZE(args) == 0)
10939 return do_strip(self, LEFTSTRIP); /* Common case */
10940 else
10941 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010942}
10943
10944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010945PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010946 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010947\n\
10948Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010949If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010950
10951static PyObject *
10952unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10953{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010954 if (PyTuple_GET_SIZE(args) == 0)
10955 return do_strip(self, RIGHTSTRIP); /* Common case */
10956 else
10957 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010958}
10959
10960
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010962unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963{
10964 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010965 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966
Georg Brandl222de0f2009-04-12 12:01:50 +000010967 if (len < 1) {
10968 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020010969 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000010970 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010971
Tim Peters7a29bd52001-09-12 03:03:31 +000010972 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973 /* no repeat, return original string */
10974 Py_INCREF(str);
10975 return (PyObject*) str;
10976 }
Tim Peters8f422462000-09-09 06:13:41 +000010977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010978 if (PyUnicode_READY(str) == -1)
10979 return NULL;
10980
Victor Stinnerc759f3e2011-10-01 03:09:58 +020010981 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020010982 PyErr_SetString(PyExc_OverflowError,
10983 "repeated string is too long");
10984 return NULL;
10985 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010986 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989 if (!u)
10990 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020010991 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010993 if (PyUnicode_GET_LENGTH(str) == 1) {
10994 const int kind = PyUnicode_KIND(str);
10995 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10996 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010997 if (kind == PyUnicode_1BYTE_KIND)
10998 memset(to, (unsigned char)fill_char, len);
10999 else {
11000 for (n = 0; n < len; ++n)
11001 PyUnicode_WRITE(kind, to, n, fill_char);
11002 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 }
11004 else {
11005 /* number of characters copied this far */
11006 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11007 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11008 char *to = (char *) PyUnicode_DATA(u);
11009 Py_MEMCPY(to, PyUnicode_DATA(str),
11010 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011011 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011012 n = (done <= nchars-done) ? done : nchars-done;
11013 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011014 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011015 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011016 }
11017
11018 return (PyObject*) u;
11019}
11020
Alexander Belopolsky40018472011-02-26 01:02:56 +000011021PyObject *
11022PyUnicode_Replace(PyObject *obj,
11023 PyObject *subobj,
11024 PyObject *replobj,
11025 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026{
11027 PyObject *self;
11028 PyObject *str1;
11029 PyObject *str2;
11030 PyObject *result;
11031
11032 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011033 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011034 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011035 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011036 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011037 Py_DECREF(self);
11038 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039 }
11040 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011041 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011042 Py_DECREF(self);
11043 Py_DECREF(str1);
11044 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011046 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011047 Py_DECREF(self);
11048 Py_DECREF(str1);
11049 Py_DECREF(str2);
11050 return result;
11051}
11052
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011053PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011054 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055\n\
11056Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011057old replaced by new. If the optional argument count is\n\
11058given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059
11060static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011061unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011063 PyObject *str1;
11064 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011065 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066 PyObject *result;
11067
Martin v. Löwis18e16552006-02-15 17:27:45 +000011068 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011070 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011071 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011072 str1 = PyUnicode_FromObject(str1);
11073 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11074 return NULL;
11075 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011076 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011077 Py_DECREF(str1);
11078 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011079 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080
11081 result = replace(self, str1, str2, maxcount);
11082
11083 Py_DECREF(str1);
11084 Py_DECREF(str2);
11085 return result;
11086}
11087
Alexander Belopolsky40018472011-02-26 01:02:56 +000011088static PyObject *
11089unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011091 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 Py_ssize_t isize;
11093 Py_ssize_t osize, squote, dquote, i, o;
11094 Py_UCS4 max, quote;
11095 int ikind, okind;
11096 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011098 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011099 return NULL;
11100
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011101 isize = PyUnicode_GET_LENGTH(unicode);
11102 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011104 /* Compute length of output, quote characters, and
11105 maximum character */
11106 osize = 2; /* quotes */
11107 max = 127;
11108 squote = dquote = 0;
11109 ikind = PyUnicode_KIND(unicode);
11110 for (i = 0; i < isize; i++) {
11111 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11112 switch (ch) {
11113 case '\'': squote++; osize++; break;
11114 case '"': dquote++; osize++; break;
11115 case '\\': case '\t': case '\r': case '\n':
11116 osize += 2; break;
11117 default:
11118 /* Fast-path ASCII */
11119 if (ch < ' ' || ch == 0x7f)
11120 osize += 4; /* \xHH */
11121 else if (ch < 0x7f)
11122 osize++;
11123 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11124 osize++;
11125 max = ch > max ? ch : max;
11126 }
11127 else if (ch < 0x100)
11128 osize += 4; /* \xHH */
11129 else if (ch < 0x10000)
11130 osize += 6; /* \uHHHH */
11131 else
11132 osize += 10; /* \uHHHHHHHH */
11133 }
11134 }
11135
11136 quote = '\'';
11137 if (squote) {
11138 if (dquote)
11139 /* Both squote and dquote present. Use squote,
11140 and escape them */
11141 osize += squote;
11142 else
11143 quote = '"';
11144 }
11145
11146 repr = PyUnicode_New(osize, max);
11147 if (repr == NULL)
11148 return NULL;
11149 okind = PyUnicode_KIND(repr);
11150 odata = PyUnicode_DATA(repr);
11151
11152 PyUnicode_WRITE(okind, odata, 0, quote);
11153 PyUnicode_WRITE(okind, odata, osize-1, quote);
11154
11155 for (i = 0, o = 1; i < isize; i++) {
11156 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011157
11158 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159 if ((ch == quote) || (ch == '\\')) {
11160 PyUnicode_WRITE(okind, odata, o++, '\\');
11161 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011162 continue;
11163 }
11164
Benjamin Peterson29060642009-01-31 22:14:21 +000011165 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011166 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011167 PyUnicode_WRITE(okind, odata, o++, '\\');
11168 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011169 }
11170 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011171 PyUnicode_WRITE(okind, odata, o++, '\\');
11172 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011173 }
11174 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175 PyUnicode_WRITE(okind, odata, o++, '\\');
11176 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011177 }
11178
11179 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011180 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011181 PyUnicode_WRITE(okind, odata, o++, '\\');
11182 PyUnicode_WRITE(okind, odata, o++, 'x');
11183 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11184 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011185 }
11186
Georg Brandl559e5d72008-06-11 18:37:52 +000011187 /* Copy ASCII characters as-is */
11188 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011189 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011190 }
11191
Benjamin Peterson29060642009-01-31 22:14:21 +000011192 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011193 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011194 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011195 (categories Z* and C* except ASCII space)
11196 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011197 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011198 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011199 if (ch <= 0xff) {
11200 PyUnicode_WRITE(okind, odata, o++, '\\');
11201 PyUnicode_WRITE(okind, odata, o++, 'x');
11202 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11203 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011204 }
11205 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011206 else if (ch >= 0x10000) {
11207 PyUnicode_WRITE(okind, odata, o++, '\\');
11208 PyUnicode_WRITE(okind, odata, o++, 'U');
11209 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11210 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11211 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11212 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11213 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11214 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11215 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11216 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011217 }
11218 /* Map 16-bit characters to '\uxxxx' */
11219 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011220 PyUnicode_WRITE(okind, odata, o++, '\\');
11221 PyUnicode_WRITE(okind, odata, o++, 'u');
11222 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11223 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11224 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11225 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011226 }
11227 }
11228 /* Copy characters as-is */
11229 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011231 }
11232 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011233 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011234 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011235 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236}
11237
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011238PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011239 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240\n\
11241Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011242such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243arguments start and end are interpreted as in slice notation.\n\
11244\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011245Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011246
11247static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011248unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249{
Jesus Ceaac451502011-04-20 17:09:23 +020011250 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011251 Py_ssize_t start;
11252 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011253 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254
Jesus Ceaac451502011-04-20 17:09:23 +020011255 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11256 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011259 if (PyUnicode_READY(self) == -1)
11260 return NULL;
11261 if (PyUnicode_READY(substring) == -1)
11262 return NULL;
11263
11264 result = any_find_slice(
11265 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11266 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011267 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268
11269 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011271 if (result == -2)
11272 return NULL;
11273
Christian Heimes217cfd12007-12-02 14:31:20 +000011274 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275}
11276
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011277PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011278 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011280Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281
11282static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011283unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284{
Jesus Ceaac451502011-04-20 17:09:23 +020011285 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011286 Py_ssize_t start;
11287 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011288 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289
Jesus Ceaac451502011-04-20 17:09:23 +020011290 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11291 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011292 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011294 if (PyUnicode_READY(self) == -1)
11295 return NULL;
11296 if (PyUnicode_READY(substring) == -1)
11297 return NULL;
11298
11299 result = any_find_slice(
11300 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11301 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011302 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303
11304 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 if (result == -2)
11307 return NULL;
11308
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309 if (result < 0) {
11310 PyErr_SetString(PyExc_ValueError, "substring not found");
11311 return NULL;
11312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011313
Christian Heimes217cfd12007-12-02 14:31:20 +000011314 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315}
11316
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011317PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011318 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011320Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011321done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322
11323static PyObject *
11324unicode_rjust(PyUnicodeObject *self, PyObject *args)
11325{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011326 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011327 Py_UCS4 fillchar = ' ';
11328
Victor Stinnere9a29352011-10-01 02:14:59 +020011329 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011330 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011331
Victor Stinnere9a29352011-10-01 02:14:59 +020011332 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011333 return NULL;
11334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011335 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336 Py_INCREF(self);
11337 return (PyObject*) self;
11338 }
11339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011340 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341}
11342
Alexander Belopolsky40018472011-02-26 01:02:56 +000011343PyObject *
11344PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345{
11346 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011347
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348 s = PyUnicode_FromObject(s);
11349 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011350 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011351 if (sep != NULL) {
11352 sep = PyUnicode_FromObject(sep);
11353 if (sep == NULL) {
11354 Py_DECREF(s);
11355 return NULL;
11356 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357 }
11358
11359 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11360
11361 Py_DECREF(s);
11362 Py_XDECREF(sep);
11363 return result;
11364}
11365
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011366PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011367 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368\n\
11369Return a list of the words in S, using sep as the\n\
11370delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011371splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011372whitespace string is a separator and empty strings are\n\
11373removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374
11375static PyObject*
11376unicode_split(PyUnicodeObject *self, PyObject *args)
11377{
11378 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011379 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380
Martin v. Löwis18e16552006-02-15 17:27:45 +000011381 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382 return NULL;
11383
11384 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011385 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011387 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011388 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011389 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390}
11391
Thomas Wouters477c8d52006-05-27 19:21:47 +000011392PyObject *
11393PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11394{
11395 PyObject* str_obj;
11396 PyObject* sep_obj;
11397 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398 int kind1, kind2, kind;
11399 void *buf1 = NULL, *buf2 = NULL;
11400 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011401
11402 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011403 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011404 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011405 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011406 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011407 Py_DECREF(str_obj);
11408 return NULL;
11409 }
11410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411 kind1 = PyUnicode_KIND(str_in);
11412 kind2 = PyUnicode_KIND(sep_obj);
11413 kind = kind1 > kind2 ? kind1 : kind2;
11414 buf1 = PyUnicode_DATA(str_in);
11415 if (kind1 != kind)
11416 buf1 = _PyUnicode_AsKind(str_in, kind);
11417 if (!buf1)
11418 goto onError;
11419 buf2 = PyUnicode_DATA(sep_obj);
11420 if (kind2 != kind)
11421 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11422 if (!buf2)
11423 goto onError;
11424 len1 = PyUnicode_GET_LENGTH(str_obj);
11425 len2 = PyUnicode_GET_LENGTH(sep_obj);
11426
11427 switch(PyUnicode_KIND(str_in)) {
11428 case PyUnicode_1BYTE_KIND:
11429 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11430 break;
11431 case PyUnicode_2BYTE_KIND:
11432 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11433 break;
11434 case PyUnicode_4BYTE_KIND:
11435 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11436 break;
11437 default:
11438 assert(0);
11439 out = 0;
11440 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011441
11442 Py_DECREF(sep_obj);
11443 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 if (kind1 != kind)
11445 PyMem_Free(buf1);
11446 if (kind2 != kind)
11447 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011448
11449 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 onError:
11451 Py_DECREF(sep_obj);
11452 Py_DECREF(str_obj);
11453 if (kind1 != kind && buf1)
11454 PyMem_Free(buf1);
11455 if (kind2 != kind && buf2)
11456 PyMem_Free(buf2);
11457 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011458}
11459
11460
11461PyObject *
11462PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11463{
11464 PyObject* str_obj;
11465 PyObject* sep_obj;
11466 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 int kind1, kind2, kind;
11468 void *buf1 = NULL, *buf2 = NULL;
11469 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011470
11471 str_obj = PyUnicode_FromObject(str_in);
11472 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011473 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011474 sep_obj = PyUnicode_FromObject(sep_in);
11475 if (!sep_obj) {
11476 Py_DECREF(str_obj);
11477 return NULL;
11478 }
11479
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011480 kind1 = PyUnicode_KIND(str_in);
11481 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011482 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011483 buf1 = PyUnicode_DATA(str_in);
11484 if (kind1 != kind)
11485 buf1 = _PyUnicode_AsKind(str_in, kind);
11486 if (!buf1)
11487 goto onError;
11488 buf2 = PyUnicode_DATA(sep_obj);
11489 if (kind2 != kind)
11490 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11491 if (!buf2)
11492 goto onError;
11493 len1 = PyUnicode_GET_LENGTH(str_obj);
11494 len2 = PyUnicode_GET_LENGTH(sep_obj);
11495
11496 switch(PyUnicode_KIND(str_in)) {
11497 case PyUnicode_1BYTE_KIND:
11498 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11499 break;
11500 case PyUnicode_2BYTE_KIND:
11501 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11502 break;
11503 case PyUnicode_4BYTE_KIND:
11504 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11505 break;
11506 default:
11507 assert(0);
11508 out = 0;
11509 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011510
11511 Py_DECREF(sep_obj);
11512 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 if (kind1 != kind)
11514 PyMem_Free(buf1);
11515 if (kind2 != kind)
11516 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011517
11518 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011519 onError:
11520 Py_DECREF(sep_obj);
11521 Py_DECREF(str_obj);
11522 if (kind1 != kind && buf1)
11523 PyMem_Free(buf1);
11524 if (kind2 != kind && buf2)
11525 PyMem_Free(buf2);
11526 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011527}
11528
11529PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011530 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011531\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011532Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011533the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011534found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011535
11536static PyObject*
11537unicode_partition(PyUnicodeObject *self, PyObject *separator)
11538{
11539 return PyUnicode_Partition((PyObject *)self, separator);
11540}
11541
11542PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011543 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011544\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011545Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011546the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011547separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011548
11549static PyObject*
11550unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11551{
11552 return PyUnicode_RPartition((PyObject *)self, separator);
11553}
11554
Alexander Belopolsky40018472011-02-26 01:02:56 +000011555PyObject *
11556PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011557{
11558 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011559
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011560 s = PyUnicode_FromObject(s);
11561 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011562 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011563 if (sep != NULL) {
11564 sep = PyUnicode_FromObject(sep);
11565 if (sep == NULL) {
11566 Py_DECREF(s);
11567 return NULL;
11568 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011569 }
11570
11571 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11572
11573 Py_DECREF(s);
11574 Py_XDECREF(sep);
11575 return result;
11576}
11577
11578PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011579 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011580\n\
11581Return a list of the words in S, using sep as the\n\
11582delimiter string, starting at the end of the string and\n\
11583working to the front. If maxsplit is given, at most maxsplit\n\
11584splits are done. If sep is not specified, any whitespace string\n\
11585is a separator.");
11586
11587static PyObject*
11588unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11589{
11590 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011591 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011592
Martin v. Löwis18e16552006-02-15 17:27:45 +000011593 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011594 return NULL;
11595
11596 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011597 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011598 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011599 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011600 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011601 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011602}
11603
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011604PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011605 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606\n\
11607Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011608Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011609is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610
11611static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011612unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011614 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011615 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011617 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11618 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619 return NULL;
11620
Guido van Rossum86662912000-04-11 15:38:46 +000011621 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622}
11623
11624static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011625PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626{
Walter Dörwald346737f2007-05-31 10:44:43 +000011627 if (PyUnicode_CheckExact(self)) {
11628 Py_INCREF(self);
11629 return self;
11630 } else
11631 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011632 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633}
11634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011635PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011636 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637\n\
11638Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011639and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640
11641static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011642unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644 return fixup(self, fixswapcase);
11645}
11646
Georg Brandlceee0772007-11-27 23:48:05 +000011647PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011648 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011649\n\
11650Return a translation table usable for str.translate().\n\
11651If there is only one argument, it must be a dictionary mapping Unicode\n\
11652ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011653Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011654If there are two arguments, they must be strings of equal length, and\n\
11655in the resulting dictionary, each character in x will be mapped to the\n\
11656character at the same position in y. If there is a third argument, it\n\
11657must be a string, whose characters will be mapped to None in the result.");
11658
11659static PyObject*
11660unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11661{
11662 PyObject *x, *y = NULL, *z = NULL;
11663 PyObject *new = NULL, *key, *value;
11664 Py_ssize_t i = 0;
11665 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011666
Georg Brandlceee0772007-11-27 23:48:05 +000011667 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11668 return NULL;
11669 new = PyDict_New();
11670 if (!new)
11671 return NULL;
11672 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011673 int x_kind, y_kind, z_kind;
11674 void *x_data, *y_data, *z_data;
11675
Georg Brandlceee0772007-11-27 23:48:05 +000011676 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011677 if (!PyUnicode_Check(x)) {
11678 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11679 "be a string if there is a second argument");
11680 goto err;
11681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011683 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11684 "arguments must have equal length");
11685 goto err;
11686 }
11687 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011688 x_kind = PyUnicode_KIND(x);
11689 y_kind = PyUnicode_KIND(y);
11690 x_data = PyUnicode_DATA(x);
11691 y_data = PyUnicode_DATA(y);
11692 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11693 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11694 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011695 if (!key || !value)
11696 goto err;
11697 res = PyDict_SetItem(new, key, value);
11698 Py_DECREF(key);
11699 Py_DECREF(value);
11700 if (res < 0)
11701 goto err;
11702 }
11703 /* create entries for deleting chars in z */
11704 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705 z_kind = PyUnicode_KIND(z);
11706 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011707 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011708 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011709 if (!key)
11710 goto err;
11711 res = PyDict_SetItem(new, key, Py_None);
11712 Py_DECREF(key);
11713 if (res < 0)
11714 goto err;
11715 }
11716 }
11717 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011718 int kind;
11719 void *data;
11720
Georg Brandlceee0772007-11-27 23:48:05 +000011721 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011722 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011723 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11724 "to maketrans it must be a dict");
11725 goto err;
11726 }
11727 /* copy entries into the new dict, converting string keys to int keys */
11728 while (PyDict_Next(x, &i, &key, &value)) {
11729 if (PyUnicode_Check(key)) {
11730 /* convert string keys to integer keys */
11731 PyObject *newkey;
11732 if (PyUnicode_GET_SIZE(key) != 1) {
11733 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11734 "table must be of length 1");
11735 goto err;
11736 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011737 kind = PyUnicode_KIND(key);
11738 data = PyUnicode_DATA(key);
11739 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011740 if (!newkey)
11741 goto err;
11742 res = PyDict_SetItem(new, newkey, value);
11743 Py_DECREF(newkey);
11744 if (res < 0)
11745 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011746 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011747 /* just keep integer keys */
11748 if (PyDict_SetItem(new, key, value) < 0)
11749 goto err;
11750 } else {
11751 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11752 "be strings or integers");
11753 goto err;
11754 }
11755 }
11756 }
11757 return new;
11758 err:
11759 Py_DECREF(new);
11760 return NULL;
11761}
11762
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011763PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011764 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765\n\
11766Return a copy of the string S, where all characters have been mapped\n\
11767through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011768Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011769Unmapped characters are left untouched. Characters mapped to None\n\
11770are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771
11772static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011773unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776}
11777
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011778PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011779 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011781Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782
11783static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011784unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786 return fixup(self, fixupper);
11787}
11788
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011789PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011790 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011792Pad a numeric string S with zeros on the left, to fill a field\n\
11793of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794
11795static PyObject *
11796unicode_zfill(PyUnicodeObject *self, PyObject *args)
11797{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011798 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011800 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011801 int kind;
11802 void *data;
11803 Py_UCS4 chr;
11804
11805 if (PyUnicode_READY(self) == -1)
11806 return NULL;
11807
Martin v. Löwis18e16552006-02-15 17:27:45 +000011808 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809 return NULL;
11810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011811 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011812 if (PyUnicode_CheckExact(self)) {
11813 Py_INCREF(self);
11814 return (PyObject*) self;
11815 }
11816 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011817 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818 }
11819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011820 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821
11822 u = pad(self, fill, 0, '0');
11823
Walter Dörwald068325e2002-04-15 13:36:47 +000011824 if (u == NULL)
11825 return NULL;
11826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011827 kind = PyUnicode_KIND(u);
11828 data = PyUnicode_DATA(u);
11829 chr = PyUnicode_READ(kind, data, fill);
11830
11831 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011833 PyUnicode_WRITE(kind, data, 0, chr);
11834 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835 }
11836
11837 return (PyObject*) u;
11838}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839
11840#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011841static PyObject *
11842unicode__decimal2ascii(PyObject *self)
11843{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844 return PyUnicode_TransformDecimalAndSpaceToASCII(self);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000011845}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846#endif
11847
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011848PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011849 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011851Return True if S starts with the specified prefix, False otherwise.\n\
11852With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011853With optional end, stop comparing S at that position.\n\
11854prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855
11856static PyObject *
11857unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011858 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011860 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011862 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011863 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011864 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865
Jesus Ceaac451502011-04-20 17:09:23 +020011866 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011867 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011868 if (PyTuple_Check(subobj)) {
11869 Py_ssize_t i;
11870 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11871 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011872 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011873 if (substring == NULL)
11874 return NULL;
11875 result = tailmatch(self, substring, start, end, -1);
11876 Py_DECREF(substring);
11877 if (result) {
11878 Py_RETURN_TRUE;
11879 }
11880 }
11881 /* nothing matched */
11882 Py_RETURN_FALSE;
11883 }
11884 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011885 if (substring == NULL) {
11886 if (PyErr_ExceptionMatches(PyExc_TypeError))
11887 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11888 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011889 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011890 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011891 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011893 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894}
11895
11896
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011897PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011898 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011900Return True if S ends with the specified suffix, False otherwise.\n\
11901With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011902With optional end, stop comparing S at that position.\n\
11903suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904
11905static PyObject *
11906unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011907 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011909 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011911 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011912 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011913 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914
Jesus Ceaac451502011-04-20 17:09:23 +020011915 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011916 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011917 if (PyTuple_Check(subobj)) {
11918 Py_ssize_t i;
11919 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11920 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011921 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011922 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011923 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011924 result = tailmatch(self, substring, start, end, +1);
11925 Py_DECREF(substring);
11926 if (result) {
11927 Py_RETURN_TRUE;
11928 }
11929 }
11930 Py_RETURN_FALSE;
11931 }
11932 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011933 if (substring == NULL) {
11934 if (PyErr_ExceptionMatches(PyExc_TypeError))
11935 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11936 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011937 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011938 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011939 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011941 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942}
11943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011945
11946PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011947 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011948\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011949Return a formatted version of S, using substitutions from args and kwargs.\n\
11950The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011951
Eric Smith27bbca62010-11-04 17:06:58 +000011952PyDoc_STRVAR(format_map__doc__,
11953 "S.format_map(mapping) -> str\n\
11954\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011955Return a formatted version of S, using substitutions from mapping.\n\
11956The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011957
Eric Smith4a7d76d2008-05-30 18:10:19 +000011958static PyObject *
11959unicode__format__(PyObject* self, PyObject* args)
11960{
11961 PyObject *format_spec;
11962
11963 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11964 return NULL;
11965
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11967 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011968}
11969
Eric Smith8c663262007-08-25 02:26:07 +000011970PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011971 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011972\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011973Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011974
11975static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011976unicode__sizeof__(PyUnicodeObject *v)
11977{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 Py_ssize_t size;
11979
11980 /* If it's a compact object, account for base structure +
11981 character data. */
11982 if (PyUnicode_IS_COMPACT_ASCII(v))
11983 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11984 else if (PyUnicode_IS_COMPACT(v))
11985 size = sizeof(PyCompactUnicodeObject) +
11986 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11987 else {
11988 /* If it is a two-block object, account for base object, and
11989 for character block if present. */
11990 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020011991 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011992 size += (PyUnicode_GET_LENGTH(v) + 1) *
11993 PyUnicode_CHARACTER_SIZE(v);
11994 }
11995 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020011996 with the data pointer. Check if the data is not shared. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 if (_PyUnicode_WSTR(v) &&
Victor Stinnera3be6132011-10-03 02:16:37 +020011998 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012000 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012001 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002
12003 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012004}
12005
12006PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012007 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012008
12009static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012010unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012011{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012012 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 if (!copy)
12014 return NULL;
12015 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012016}
12017
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018static PyMethodDef unicode_methods[] = {
12019
12020 /* Order is according to common usage: often used methods should
12021 appear first, since lookup is done sequentially. */
12022
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012023 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012024 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12025 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012026 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012027 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12028 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12029 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12030 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12031 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12032 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12033 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012034 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012035 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12036 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12037 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012038 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012039 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12040 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12041 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012042 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012043 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012044 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012045 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012046 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12047 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12048 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12049 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12050 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12051 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12052 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12053 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12054 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12055 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12056 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12057 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12058 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12059 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012060 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012061 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012062 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012063 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012064 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012065 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012066 {"maketrans", (PyCFunction) unicode_maketrans,
12067 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012068 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012069#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012070 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071#endif
12072
12073#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012074 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012075 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076#endif
12077
Benjamin Peterson14339b62009-01-31 16:36:08 +000012078 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079 {NULL, NULL}
12080};
12081
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012082static PyObject *
12083unicode_mod(PyObject *v, PyObject *w)
12084{
Brian Curtindfc80e32011-08-10 20:28:54 -050012085 if (!PyUnicode_Check(v))
12086 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012087 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012088}
12089
12090static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012091 0, /*nb_add*/
12092 0, /*nb_subtract*/
12093 0, /*nb_multiply*/
12094 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012095};
12096
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012098 (lenfunc) unicode_length, /* sq_length */
12099 PyUnicode_Concat, /* sq_concat */
12100 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12101 (ssizeargfunc) unicode_getitem, /* sq_item */
12102 0, /* sq_slice */
12103 0, /* sq_ass_item */
12104 0, /* sq_ass_slice */
12105 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106};
12107
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012108static PyObject*
12109unicode_subscript(PyUnicodeObject* self, PyObject* item)
12110{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 if (PyUnicode_READY(self) == -1)
12112 return NULL;
12113
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012114 if (PyIndex_Check(item)) {
12115 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012116 if (i == -1 && PyErr_Occurred())
12117 return NULL;
12118 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012119 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012120 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012121 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012122 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012123 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012124 Py_UNICODE* result_buf;
12125 PyObject* result;
12126
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012128 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012129 return NULL;
12130 }
12131
12132 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012133 return PyUnicode_New(0, 0);
12134 } else if (start == 0 && step == 1 &&
12135 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012136 PyUnicode_CheckExact(self)) {
12137 Py_INCREF(self);
12138 return (PyObject *)self;
12139 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012140 return PyUnicode_Substring((PyObject*)self,
12141 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012142 } else {
12143 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000012144 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
12145 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012146
Benjamin Peterson29060642009-01-31 22:14:21 +000012147 if (result_buf == NULL)
12148 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012149
12150 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12151 result_buf[i] = source_buf[cur];
12152 }
Tim Petersced69f82003-09-16 20:30:58 +000012153
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012154 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000012155 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012156 return result;
12157 }
12158 } else {
12159 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12160 return NULL;
12161 }
12162}
12163
12164static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012165 (lenfunc)unicode_length, /* mp_length */
12166 (binaryfunc)unicode_subscript, /* mp_subscript */
12167 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012168};
12169
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171/* Helpers for PyUnicode_Format() */
12172
12173static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012174getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012176 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012178 (*p_argidx)++;
12179 if (arglen < 0)
12180 return args;
12181 else
12182 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183 }
12184 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012185 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186 return NULL;
12187}
12188
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012189/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012190
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012191static PyObject *
12192formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012194 char *p;
12195 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012196 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012197
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198 x = PyFloat_AsDouble(v);
12199 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012200 return NULL;
12201
Guido van Rossumd57fd912000-03-10 22:53:23 +000012202 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012203 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012204
Eric Smith0923d1d2009-04-16 20:16:10 +000012205 p = PyOS_double_to_string(x, type, prec,
12206 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012207 if (p == NULL)
12208 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012209 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012210 PyMem_Free(p);
12211 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212}
12213
Tim Peters38fd5b62000-09-21 05:43:11 +000012214static PyObject*
12215formatlong(PyObject *val, int flags, int prec, int type)
12216{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012217 char *buf;
12218 int len;
12219 PyObject *str; /* temporary string object. */
12220 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012221
Benjamin Peterson14339b62009-01-31 16:36:08 +000012222 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12223 if (!str)
12224 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012225 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012226 Py_DECREF(str);
12227 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012228}
12229
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012232 size_t buflen,
12233 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012235 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012236 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 if (PyUnicode_GET_LENGTH(v) == 1) {
12238 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012239 buf[1] = '\0';
12240 return 1;
12241 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012242 goto onError;
12243 }
12244 else {
12245 /* Integer input truncated to a character */
12246 long x;
12247 x = PyLong_AsLong(v);
12248 if (x == -1 && PyErr_Occurred())
12249 goto onError;
12250
12251 if (x < 0 || x > 0x10ffff) {
12252 PyErr_SetString(PyExc_OverflowError,
12253 "%c arg not in range(0x110000)");
12254 return -1;
12255 }
12256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012258 buf[1] = '\0';
12259 return 1;
12260 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012261
Benjamin Peterson29060642009-01-31 22:14:21 +000012262 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012263 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012264 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012265 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266}
12267
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted:
   it is the element count of the Py_UCS4 scratch array that
   PyUnicode_Format() fills for the '%%' and '%c' conversions.
*/
#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012272
Alexander Belopolsky40018472011-02-26 01:02:56 +000012273PyObject *
12274PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012276 void *fmt;
12277 int fmtkind;
12278 PyObject *result;
12279 Py_UCS4 *res, *res0;
12280 Py_UCS4 max;
12281 int kind;
12282 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012285 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012286
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012288 PyErr_BadInternalCall();
12289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12292 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012293 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012294 fmt = PyUnicode_DATA(uformat);
12295 fmtkind = PyUnicode_KIND(uformat);
12296 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12297 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012298
12299 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12301 if (res0 == NULL) {
12302 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012303 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305
12306 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012307 arglen = PyTuple_Size(args);
12308 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012309 }
12310 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012311 arglen = -1;
12312 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012313 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012314 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012315 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012316 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012317
12318 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012320 if (--rescnt < 0) {
12321 rescnt = fmtcnt + 100;
12322 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12324 if (res0 == NULL){
12325 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012326 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012327 }
12328 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012329 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012330 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012332 }
12333 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012334 /* Got a format specifier */
12335 int flags = 0;
12336 Py_ssize_t width = -1;
12337 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012338 Py_UCS4 c = '\0';
12339 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012340 int isnumok;
12341 PyObject *v = NULL;
12342 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 void *pbuf;
12344 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012345 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 Py_ssize_t len, len1;
12347 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012348
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 fmtpos++;
12350 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12351 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012352 Py_ssize_t keylen;
12353 PyObject *key;
12354 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012355
Benjamin Peterson29060642009-01-31 22:14:21 +000012356 if (dict == NULL) {
12357 PyErr_SetString(PyExc_TypeError,
12358 "format requires a mapping");
12359 goto onError;
12360 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012362 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012364 /* Skip over balanced parentheses */
12365 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012367 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012368 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012369 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012371 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012373 if (fmtcnt < 0 || pcount > 0) {
12374 PyErr_SetString(PyExc_ValueError,
12375 "incomplete format key");
12376 goto onError;
12377 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012378 key = PyUnicode_Substring((PyObject*)uformat,
12379 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012380 if (key == NULL)
12381 goto onError;
12382 if (args_owned) {
12383 Py_DECREF(args);
12384 args_owned = 0;
12385 }
12386 args = PyObject_GetItem(dict, key);
12387 Py_DECREF(key);
12388 if (args == NULL) {
12389 goto onError;
12390 }
12391 args_owned = 1;
12392 arglen = -1;
12393 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012394 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012395 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012396 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012397 case '-': flags |= F_LJUST; continue;
12398 case '+': flags |= F_SIGN; continue;
12399 case ' ': flags |= F_BLANK; continue;
12400 case '#': flags |= F_ALT; continue;
12401 case '0': flags |= F_ZERO; continue;
12402 }
12403 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012404 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012405 if (c == '*') {
12406 v = getnextarg(args, arglen, &argidx);
12407 if (v == NULL)
12408 goto onError;
12409 if (!PyLong_Check(v)) {
12410 PyErr_SetString(PyExc_TypeError,
12411 "* wants int");
12412 goto onError;
12413 }
12414 width = PyLong_AsLong(v);
12415 if (width == -1 && PyErr_Occurred())
12416 goto onError;
12417 if (width < 0) {
12418 flags |= F_LJUST;
12419 width = -width;
12420 }
12421 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012422 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012423 }
12424 else if (c >= '0' && c <= '9') {
12425 width = c - '0';
12426 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012427 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012428 if (c < '0' || c > '9')
12429 break;
12430 if ((width*10) / 10 != width) {
12431 PyErr_SetString(PyExc_ValueError,
12432 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012433 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012434 }
12435 width = width*10 + (c - '0');
12436 }
12437 }
12438 if (c == '.') {
12439 prec = 0;
12440 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012442 if (c == '*') {
12443 v = getnextarg(args, arglen, &argidx);
12444 if (v == NULL)
12445 goto onError;
12446 if (!PyLong_Check(v)) {
12447 PyErr_SetString(PyExc_TypeError,
12448 "* wants int");
12449 goto onError;
12450 }
12451 prec = PyLong_AsLong(v);
12452 if (prec == -1 && PyErr_Occurred())
12453 goto onError;
12454 if (prec < 0)
12455 prec = 0;
12456 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012457 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012458 }
12459 else if (c >= '0' && c <= '9') {
12460 prec = c - '0';
12461 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012462 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012463 if (c < '0' || c > '9')
12464 break;
12465 if ((prec*10) / 10 != prec) {
12466 PyErr_SetString(PyExc_ValueError,
12467 "prec too big");
12468 goto onError;
12469 }
12470 prec = prec*10 + (c - '0');
12471 }
12472 }
12473 } /* prec */
12474 if (fmtcnt >= 0) {
12475 if (c == 'h' || c == 'l' || c == 'L') {
12476 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012478 }
12479 }
12480 if (fmtcnt < 0) {
12481 PyErr_SetString(PyExc_ValueError,
12482 "incomplete format");
12483 goto onError;
12484 }
12485 if (c != '%') {
12486 v = getnextarg(args, arglen, &argidx);
12487 if (v == NULL)
12488 goto onError;
12489 }
12490 sign = 0;
12491 fill = ' ';
12492 switch (c) {
12493
12494 case '%':
12495 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012496 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012497 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012499 len = 1;
12500 break;
12501
12502 case 's':
12503 case 'r':
12504 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012505 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012506 temp = v;
12507 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012508 }
12509 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012510 if (c == 's')
12511 temp = PyObject_Str(v);
12512 else if (c == 'r')
12513 temp = PyObject_Repr(v);
12514 else
12515 temp = PyObject_ASCII(v);
12516 if (temp == NULL)
12517 goto onError;
12518 if (PyUnicode_Check(temp))
12519 /* nothing to do */;
12520 else {
12521 Py_DECREF(temp);
12522 PyErr_SetString(PyExc_TypeError,
12523 "%s argument has non-string str()");
12524 goto onError;
12525 }
12526 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012527 if (PyUnicode_READY(temp) == -1) {
12528 Py_CLEAR(temp);
12529 goto onError;
12530 }
12531 pbuf = PyUnicode_DATA(temp);
12532 kind = PyUnicode_KIND(temp);
12533 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012534 if (prec >= 0 && len > prec)
12535 len = prec;
12536 break;
12537
12538 case 'i':
12539 case 'd':
12540 case 'u':
12541 case 'o':
12542 case 'x':
12543 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012544 isnumok = 0;
12545 if (PyNumber_Check(v)) {
12546 PyObject *iobj=NULL;
12547
12548 if (PyLong_Check(v)) {
12549 iobj = v;
12550 Py_INCREF(iobj);
12551 }
12552 else {
12553 iobj = PyNumber_Long(v);
12554 }
12555 if (iobj!=NULL) {
12556 if (PyLong_Check(iobj)) {
12557 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012558 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012559 Py_DECREF(iobj);
12560 if (!temp)
12561 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012562 if (PyUnicode_READY(temp) == -1) {
12563 Py_CLEAR(temp);
12564 goto onError;
12565 }
12566 pbuf = PyUnicode_DATA(temp);
12567 kind = PyUnicode_KIND(temp);
12568 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012569 sign = 1;
12570 }
12571 else {
12572 Py_DECREF(iobj);
12573 }
12574 }
12575 }
12576 if (!isnumok) {
12577 PyErr_Format(PyExc_TypeError,
12578 "%%%c format: a number is required, "
12579 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12580 goto onError;
12581 }
12582 if (flags & F_ZERO)
12583 fill = '0';
12584 break;
12585
12586 case 'e':
12587 case 'E':
12588 case 'f':
12589 case 'F':
12590 case 'g':
12591 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012592 temp = formatfloat(v, flags, prec, c);
12593 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012594 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012595 if (PyUnicode_READY(temp) == -1) {
12596 Py_CLEAR(temp);
12597 goto onError;
12598 }
12599 pbuf = PyUnicode_DATA(temp);
12600 kind = PyUnicode_KIND(temp);
12601 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012602 sign = 1;
12603 if (flags & F_ZERO)
12604 fill = '0';
12605 break;
12606
12607 case 'c':
12608 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012609 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012610 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012611 if (len < 0)
12612 goto onError;
12613 break;
12614
12615 default:
12616 PyErr_Format(PyExc_ValueError,
12617 "unsupported format character '%c' (0x%x) "
12618 "at index %zd",
12619 (31<=c && c<=126) ? (char)c : '?',
12620 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012621 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012622 goto onError;
12623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012624 /* pbuf is initialized here. */
12625 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012626 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012627 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12628 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12629 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012630 len--;
12631 }
12632 else if (flags & F_SIGN)
12633 sign = '+';
12634 else if (flags & F_BLANK)
12635 sign = ' ';
12636 else
12637 sign = 0;
12638 }
12639 if (width < len)
12640 width = len;
12641 if (rescnt - (sign != 0) < width) {
12642 reslen -= rescnt;
12643 rescnt = width + fmtcnt + 100;
12644 reslen += rescnt;
12645 if (reslen < 0) {
12646 Py_XDECREF(temp);
12647 PyErr_NoMemory();
12648 goto onError;
12649 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12651 if (res0 == 0) {
12652 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012653 Py_XDECREF(temp);
12654 goto onError;
12655 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012656 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012657 }
12658 if (sign) {
12659 if (fill != ' ')
12660 *res++ = sign;
12661 rescnt--;
12662 if (width > len)
12663 width--;
12664 }
12665 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12667 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012668 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12670 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012671 }
12672 rescnt -= 2;
12673 width -= 2;
12674 if (width < 0)
12675 width = 0;
12676 len -= 2;
12677 }
12678 if (width > len && !(flags & F_LJUST)) {
12679 do {
12680 --rescnt;
12681 *res++ = fill;
12682 } while (--width > len);
12683 }
12684 if (fill == ' ') {
12685 if (sign)
12686 *res++ = sign;
12687 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12689 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12690 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12691 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012692 }
12693 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012694 /* Copy all characters, preserving len */
12695 len1 = len;
12696 while (len1--) {
12697 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12698 rescnt--;
12699 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012700 while (--width >= len) {
12701 --rescnt;
12702 *res++ = ' ';
12703 }
12704 if (dict && (argidx < arglen) && c != '%') {
12705 PyErr_SetString(PyExc_TypeError,
12706 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012707 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012708 goto onError;
12709 }
12710 Py_XDECREF(temp);
12711 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012712 } /* until end */
12713 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012714 PyErr_SetString(PyExc_TypeError,
12715 "not all arguments converted during string formatting");
12716 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012717 }
12718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012719
12720 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12721 if (*res > max)
12722 max = *res;
12723 result = PyUnicode_New(reslen - rescnt, max);
12724 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012725 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012726 kind = PyUnicode_KIND(result);
12727 for (res = res0; res < res0+reslen-rescnt; res++)
12728 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12729 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012730 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012731 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012732 }
12733 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012734 return (PyObject *)result;
12735
Benjamin Peterson29060642009-01-31 22:14:21 +000012736 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012737 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012738 Py_DECREF(uformat);
12739 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012740 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012741 }
12742 return NULL;
12743}
12744
Jeremy Hylton938ace62002-07-17 16:30:39 +000012745static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012746unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12747
Tim Peters6d6c1a32001-08-02 04:15:00 +000012748static PyObject *
12749unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12750{
Benjamin Peterson29060642009-01-31 22:14:21 +000012751 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012752 static char *kwlist[] = {"object", "encoding", "errors", 0};
12753 char *encoding = NULL;
12754 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012755
Benjamin Peterson14339b62009-01-31 16:36:08 +000012756 if (type != &PyUnicode_Type)
12757 return unicode_subtype_new(type, args, kwds);
12758 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012759 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012760 return NULL;
12761 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012762 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012763 if (encoding == NULL && errors == NULL)
12764 return PyObject_Str(x);
12765 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012766 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012767}
12768
Guido van Rossume023fe02001-08-30 03:12:59 +000012769static PyObject *
12770unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12771{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012772 PyUnicodeObject *unicode, *self;
12773 Py_ssize_t length, char_size;
12774 int share_wstr, share_utf8;
12775 unsigned int kind;
12776 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012777
Benjamin Peterson14339b62009-01-31 16:36:08 +000012778 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012779
12780 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12781 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012782 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012783 assert(_PyUnicode_CHECK(unicode));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012784 if (PyUnicode_READY(unicode))
12785 return NULL;
12786
12787 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12788 if (self == NULL) {
12789 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012790 return NULL;
12791 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012792 kind = PyUnicode_KIND(unicode);
12793 length = PyUnicode_GET_LENGTH(unicode);
12794
12795 _PyUnicode_LENGTH(self) = length;
12796 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12797 _PyUnicode_STATE(self).interned = 0;
12798 _PyUnicode_STATE(self).kind = kind;
12799 _PyUnicode_STATE(self).compact = 0;
12800 _PyUnicode_STATE(self).ascii = 0;
12801 _PyUnicode_STATE(self).ready = 1;
12802 _PyUnicode_WSTR(self) = NULL;
12803 _PyUnicode_UTF8_LENGTH(self) = 0;
12804 _PyUnicode_UTF8(self) = NULL;
12805 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012806 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012807
12808 share_utf8 = 0;
12809 share_wstr = 0;
12810 if (kind == PyUnicode_1BYTE_KIND) {
12811 char_size = 1;
12812 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12813 share_utf8 = 1;
12814 }
12815 else if (kind == PyUnicode_2BYTE_KIND) {
12816 char_size = 2;
12817 if (sizeof(wchar_t) == 2)
12818 share_wstr = 1;
12819 }
12820 else {
12821 assert(kind == PyUnicode_4BYTE_KIND);
12822 char_size = 4;
12823 if (sizeof(wchar_t) == 4)
12824 share_wstr = 1;
12825 }
12826
12827 /* Ensure we won't overflow the length. */
12828 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12829 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012830 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012831 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012832 data = PyObject_MALLOC((length + 1) * char_size);
12833 if (data == NULL) {
12834 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012835 goto onError;
12836 }
12837
Victor Stinnerc3c74152011-10-02 20:39:55 +020012838 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012839 if (share_utf8) {
12840 _PyUnicode_UTF8_LENGTH(self) = length;
12841 _PyUnicode_UTF8(self) = data;
12842 }
12843 if (share_wstr) {
12844 _PyUnicode_WSTR_LENGTH(self) = length;
12845 _PyUnicode_WSTR(self) = (wchar_t *)data;
12846 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012847
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012848 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12849 PyUnicode_KIND_SIZE(kind, length + 1));
12850 Py_DECREF(unicode);
12851 return (PyObject *)self;
12852
12853onError:
12854 Py_DECREF(unicode);
12855 Py_DECREF(self);
12856 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012857}
12858
/* Docstring text for the str type; installed as tp_doc in PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012865
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012866static PyObject *unicode_iter(PyObject *seq);
12867
/* Type object for str.  Slots are filled positionally, so the order of the
   entries below is significant.  The type may be subclassed
   (Py_TPFLAGS_BASETYPE), instances are created through unicode_new and
   released through unicode_dealloc / PyObject_Del. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
12911
12912/* Initialize the Unicode implementation */
12913
Thomas Wouters78890102000-07-22 19:25:51 +000012914void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012915{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012916 int i;
12917
Thomas Wouters477c8d52006-05-27 19:21:47 +000012918 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012919 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012920 0x000A, /* LINE FEED */
12921 0x000D, /* CARRIAGE RETURN */
12922 0x001C, /* FILE SEPARATOR */
12923 0x001D, /* GROUP SEPARATOR */
12924 0x001E, /* RECORD SEPARATOR */
12925 0x0085, /* NEXT LINE */
12926 0x2028, /* LINE SEPARATOR */
12927 0x2029, /* PARAGRAPH SEPARATOR */
12928 };
12929
Fred Drakee4315f52000-05-09 19:53:39 +000012930 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020012931 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012932 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012933 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012934
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012935 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012936 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012937 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012938 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012939
12940 /* initialize the linebreak bloom filter */
12941 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012942 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012943 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012944
12945 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012946}
12947
12948/* Finalize the Unicode implementation */
12949
/* Nothing is released here: the body performs no work and always
   returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
12955
Guido van Rossumd57fd912000-03-10 22:53:23 +000012956void
Thomas Wouters78890102000-07-22 19:25:51 +000012957_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012959 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012960
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012961 Py_XDECREF(unicode_empty);
12962 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012963
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012964 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012965 if (unicode_latin1[i]) {
12966 Py_DECREF(unicode_latin1[i]);
12967 unicode_latin1[i] = NULL;
12968 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012969 }
Christian Heimesa156e092008-02-16 07:38:31 +000012970 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012971}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012972
Walter Dörwald16807132007-05-25 13:52:07 +000012973void
12974PyUnicode_InternInPlace(PyObject **p)
12975{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012976 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12977 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020012978#ifdef Py_DEBUG
12979 assert(s != NULL);
12980 assert(_PyUnicode_CHECK(s));
12981#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000012982 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020012983 return;
12984#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000012985 /* If it's a subclass, we don't really know what putting
12986 it in the interned dict might do. */
12987 if (!PyUnicode_CheckExact(s))
12988 return;
12989 if (PyUnicode_CHECK_INTERNED(s))
12990 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012991 if (PyUnicode_READY(s) == -1) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020012992 assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012993 return;
12994 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012995 if (interned == NULL) {
12996 interned = PyDict_New();
12997 if (interned == NULL) {
12998 PyErr_Clear(); /* Don't leave an exception */
12999 return;
13000 }
13001 }
13002 /* It might be that the GetItem call fails even
13003 though the key is present in the dictionary,
13004 namely when this happens during a stack overflow. */
13005 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013006 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013007 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013008
Benjamin Peterson29060642009-01-31 22:14:21 +000013009 if (t) {
13010 Py_INCREF(t);
13011 Py_DECREF(*p);
13012 *p = t;
13013 return;
13014 }
Walter Dörwald16807132007-05-25 13:52:07 +000013015
Benjamin Peterson14339b62009-01-31 16:36:08 +000013016 PyThreadState_GET()->recursion_critical = 1;
13017 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13018 PyErr_Clear();
13019 PyThreadState_GET()->recursion_critical = 0;
13020 return;
13021 }
13022 PyThreadState_GET()->recursion_critical = 0;
13023 /* The two references in interned are not counted by refcnt.
13024 The deallocator will take care of this */
13025 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013026 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013027}
13028
13029void
13030PyUnicode_InternImmortal(PyObject **p)
13031{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013032 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13033
Benjamin Peterson14339b62009-01-31 16:36:08 +000013034 PyUnicode_InternInPlace(p);
13035 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013036 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013037 Py_INCREF(*p);
13038 }
Walter Dörwald16807132007-05-25 13:52:07 +000013039}
13040
13041PyObject *
13042PyUnicode_InternFromString(const char *cp)
13043{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013044 PyObject *s = PyUnicode_FromString(cp);
13045 if (s == NULL)
13046 return NULL;
13047 PyUnicode_InternInPlace(&s);
13048 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013049}
13050
Alexander Belopolsky40018472011-02-26 01:02:56 +000013051void
13052_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013053{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013054 PyObject *keys;
13055 PyUnicodeObject *s;
13056 Py_ssize_t i, n;
13057 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013058
Benjamin Peterson14339b62009-01-31 16:36:08 +000013059 if (interned == NULL || !PyDict_Check(interned))
13060 return;
13061 keys = PyDict_Keys(interned);
13062 if (keys == NULL || !PyList_Check(keys)) {
13063 PyErr_Clear();
13064 return;
13065 }
Walter Dörwald16807132007-05-25 13:52:07 +000013066
Benjamin Peterson14339b62009-01-31 16:36:08 +000013067 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13068 detector, interned unicode strings are not forcibly deallocated;
13069 rather, we give them their stolen references back, and then clear
13070 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013071
Benjamin Peterson14339b62009-01-31 16:36:08 +000013072 n = PyList_GET_SIZE(keys);
13073 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013074 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013075 for (i = 0; i < n; i++) {
13076 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013077 if (PyUnicode_READY(s) == -1)
13078 fprintf(stderr, "could not ready string\n");
13079 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013080 case SSTATE_NOT_INTERNED:
13081 /* XXX Shouldn't happen */
13082 break;
13083 case SSTATE_INTERNED_IMMORTAL:
13084 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013085 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013086 break;
13087 case SSTATE_INTERNED_MORTAL:
13088 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013089 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013090 break;
13091 default:
13092 Py_FatalError("Inconsistent interned string state.");
13093 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013094 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013095 }
13096 fprintf(stderr, "total size of all interned strings: "
13097 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13098 "mortal/immortal\n", mortal_size, immortal_size);
13099 Py_DECREF(keys);
13100 PyDict_Clear(interned);
13101 Py_DECREF(interned);
13102 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013103}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013104
13105
13106/********************* Unicode Iterator **************************/
13107
13108typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013109 PyObject_HEAD
13110 Py_ssize_t it_index;
13111 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013112} unicodeiterobject;
13113
13114static void
13115unicodeiter_dealloc(unicodeiterobject *it)
13116{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013117 _PyObject_GC_UNTRACK(it);
13118 Py_XDECREF(it->it_seq);
13119 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013120}
13121
13122static int
13123unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13124{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013125 Py_VISIT(it->it_seq);
13126 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013127}
13128
13129static PyObject *
13130unicodeiter_next(unicodeiterobject *it)
13131{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013132 PyUnicodeObject *seq;
13133 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013134
Benjamin Peterson14339b62009-01-31 16:36:08 +000013135 assert(it != NULL);
13136 seq = it->it_seq;
13137 if (seq == NULL)
13138 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013139 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013141 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13142 int kind = PyUnicode_KIND(seq);
13143 void *data = PyUnicode_DATA(seq);
13144 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13145 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013146 if (item != NULL)
13147 ++it->it_index;
13148 return item;
13149 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013150
Benjamin Peterson14339b62009-01-31 16:36:08 +000013151 Py_DECREF(seq);
13152 it->it_seq = NULL;
13153 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013154}
13155
13156static PyObject *
13157unicodeiter_len(unicodeiterobject *it)
13158{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013159 Py_ssize_t len = 0;
13160 if (it->it_seq)
13161 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13162 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013163}
13164
13165PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13166
13167static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013168 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013169 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013170 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013171};
13172
13173PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013174 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13175 "str_iterator", /* tp_name */
13176 sizeof(unicodeiterobject), /* tp_basicsize */
13177 0, /* tp_itemsize */
13178 /* methods */
13179 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13180 0, /* tp_print */
13181 0, /* tp_getattr */
13182 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013183 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013184 0, /* tp_repr */
13185 0, /* tp_as_number */
13186 0, /* tp_as_sequence */
13187 0, /* tp_as_mapping */
13188 0, /* tp_hash */
13189 0, /* tp_call */
13190 0, /* tp_str */
13191 PyObject_GenericGetAttr, /* tp_getattro */
13192 0, /* tp_setattro */
13193 0, /* tp_as_buffer */
13194 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13195 0, /* tp_doc */
13196 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13197 0, /* tp_clear */
13198 0, /* tp_richcompare */
13199 0, /* tp_weaklistoffset */
13200 PyObject_SelfIter, /* tp_iter */
13201 (iternextfunc)unicodeiter_next, /* tp_iternext */
13202 unicodeiter_methods, /* tp_methods */
13203 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013204};
13205
13206static PyObject *
13207unicode_iter(PyObject *seq)
13208{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013209 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013210
Benjamin Peterson14339b62009-01-31 16:36:08 +000013211 if (!PyUnicode_Check(seq)) {
13212 PyErr_BadInternalCall();
13213 return NULL;
13214 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013215 if (PyUnicode_READY(seq) == -1)
13216 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013217 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13218 if (it == NULL)
13219 return NULL;
13220 it->it_index = 0;
13221 Py_INCREF(seq);
13222 it->it_seq = (PyUnicodeObject *)seq;
13223 _PyObject_GC_TRACK(it);
13224 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013225}
13226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013227#define UNIOP(x) Py_UNICODE_##x
13228#define UNIOP_t Py_UNICODE
13229#include "uniops.h"
13230#undef UNIOP
13231#undef UNIOP_t
13232#define UNIOP(x) Py_UCS4_##x
13233#define UNIOP_t Py_UCS4
13234#include "uniops.h"
13235#undef UNIOP
13236#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013237
Victor Stinner71133ff2010-09-01 23:43:53 +000013238Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013239PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013240{
13241 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13242 Py_UNICODE *copy;
13243 Py_ssize_t size;
13244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013245 if (!PyUnicode_Check(unicode)) {
13246 PyErr_BadArgument();
13247 return NULL;
13248 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013249 /* Ensure we won't overflow the size. */
13250 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13251 PyErr_NoMemory();
13252 return NULL;
13253 }
13254 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13255 size *= sizeof(Py_UNICODE);
13256 copy = PyMem_Malloc(size);
13257 if (copy == NULL) {
13258 PyErr_NoMemory();
13259 return NULL;
13260 }
13261 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13262 return copy;
13263}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013264
Georg Brandl66c221e2010-10-14 07:04:07 +000013265/* A _string module, to export formatter_parser and formatter_field_name_split
13266 to the string.Formatter class implemented in Python. */
13267
13268static PyMethodDef _string_methods[] = {
13269 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13270 METH_O, PyDoc_STR("split the argument as a field name")},
13271 {"formatter_parser", (PyCFunction) formatter_parser,
13272 METH_O, PyDoc_STR("parse the argument as a format string")},
13273 {NULL, NULL}
13274};
13275
13276static struct PyModuleDef _string_module = {
13277 PyModuleDef_HEAD_INIT,
13278 "_string",
13279 PyDoc_STR("string helper module"),
13280 0,
13281 _string_methods,
13282 NULL,
13283 NULL,
13284 NULL,
13285 NULL
13286};
13287
13288PyMODINIT_FUNC
13289PyInit__string(void)
13290{
13291 return PyModule_Create(&_string_module);
13292}
13293
13294
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013295#ifdef __cplusplus
13296}
13297#endif