blob: e8b19cfbf56e5a7c080dca1e88808d9e49da922e [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Limit for the Unicode object free list */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is
   defined, selected by WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
79
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
82 The globals are initialized by the _PyUnicode_Init() API and should
83 not be used before calling that API.
84
85*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000086
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000087
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Type check used inside the assertions below.  Debug builds run the full
   _PyUnicode_CheckConsistency() validation; other builds only call
   PyUnicode_Check(). */
#ifdef Py_DEBUG
# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#else
# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Raw access to the utf8 field (object viewed as PyCompactUnicodeObject);
   performs no checks. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked access to the UTF-8 buffer: for compact ASCII objects this is the
   memory directly following the PyASCIIObject header, otherwise the utf8
   field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length field; performs no checks. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length: for compact ASCII objects this is the length field,
   otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked lvalue accessors for the individual header fields. */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
/* Same as the state.kind / length fields, but asserting _PyUnicode_CHECK()
   first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Unchecked lvalue access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)

/* Local replacement for PyUnicode_READY: evaluates to 0 when the object is
   already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.
   Each source element is converted with a plain cast; no range check is
   performed. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_; to_type *to_;           \
        for (iter_ = (begin), to_ = (to_type *)(to);    \
             iter_ < (end);                             \
             ++iter_, ++to_) {                          \
            *to_ = (to_type)*iter_;                     \
        }                                               \
    } while (0)

/* The Unicode string has been modified: reset the hash */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
158
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance. */
static PyObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well. */
static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175
/* Fast detection of the most frequent whitespace characters.
   The table has 128 entries, eight per row; an entry is 1 for the
   characters named in the comments preceding its row and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
206
/* Forward declaration; the definition appears further down in this file. */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);

/* Forward declarations of the encode error helpers so that code placed
   before their definitions can call them. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       const Py_UNICODE *unicode, Py_ssize_t size,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000221
/* Fast detection of line break characters in the ASCII range.
   The table has 128 entries, eight per row, and is indexed by the
   character value (see BLOOM_LINEBREAK, which only indexes it for
   ch < 128).  An entry is 1 for the characters named in the comments
   preceding its row and 0 otherwise.  The table is never written to, so
   it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
249
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300250/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
251 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000252Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000253PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000254{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000255#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000256 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000257#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000258 /* This is actually an illegal character, so it should
259 not be passed to unichr. */
260 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000261#endif
262}
263
#ifdef Py_DEBUG
/* Debug-build sanity check of a Unicode object's state bits.

   op must point to a Unicode object.  Every invariant is enforced with
   assert(), so a violation aborts; when all of them hold the function
   returns 1, which lets it be used inside assert() itself (see
   _PyUnicode_CHECK).

   Three layouts are recognised:
     - ASCII:          1-byte kind, compact, ready;
     - compact:        1/2/4-byte kind, not ASCII, ready;
     - legacy (other): either wstr-only (not ready, no data, no utf8, not
                       interned) or ready with a data pointer. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.compact == 1);
        assert(ascii->state.ready == 1);
    }
    else if (ascii->state.compact == 1) {
        /* compact == 1 is the branch condition, so it is not re-asserted. */
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        PyUnicodeObject *unicode = (PyUnicodeObject *)op;

        if (kind == PyUnicode_WCHAR_KIND) {
            /* Legacy object that only has a wchar_t representation.
               The flags are compared with "== 0" rather than the former
               "!flag == 1", which parses as "(!flag) == 1". */
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->wstr != NULL);
            assert(unicode->data.any == NULL);
            assert(compact->utf8 == NULL);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
        }
        else {
            /* Legacy object that has been made ready. */
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(unicode->data.any != NULL);
            assert(ascii->state.ascii == 0);
        }
    }
    return 1;
}
#endif
314
Thomas Wouters477c8d52006-05-27 19:21:47 +0000315/* --- Bloom Filters ----------------------------------------------------- */
316
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the lowest
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */
320
321/* the linebreak mask is set up by Unicode_Init below */
322
/* Number of bits used by a bloom mask: the largest of 128, 64 and 32 that
   does not exceed the width of a long. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the bit selected by the low bits of ch.
   The mask argument is parenthesized so that expressions such as
   BLOOM(a | b, ch) group as intended; for BLOOM_ADD it must be an lvalue. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: table lookup for ch < 128, otherwise the bloom mask is
   consulted first and Py_UNICODE_ISLINEBREAK() only on a possible hit. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000343
Alexander Belopolsky40018472011-02-26 01:02:56 +0000344Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200345make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000346{
347 /* calculate simple bloom-style bitmask for a given unicode string */
348
Antoine Pitrouf068f942010-01-13 14:19:12 +0000349 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000350 Py_ssize_t i;
351
352 mask = 0;
353 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200354 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000355
356 return mask;
357}
358
/* True if chr occurs in the Unicode object str.  mask is the bloom mask
   of str; it is tested first so that PyUnicode_FindChar() only runs on a
   possible hit.  All arguments are parenthesized and may be evaluated
   more than once. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM((mask), (chr))                                               \
     && (PyUnicode_FindChar((str), (chr), 0,                            \
                            PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000362
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363/* --- Unicode Object ----------------------------------------------------- */
364
/* Forward declaration.  fixfct is a callback that takes the string and
   returns a Py_UCS4; see the definition of fixup() for the contract. */
static PyObject *
fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
367
368Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
369 Py_ssize_t size, Py_UCS4 ch,
370 int direction)
371{
372 /* like wcschr, but doesn't stop at NULL characters */
373 Py_ssize_t i;
374 if (direction == 1) {
375 for(i = 0; i < size; i++)
376 if (PyUnicode_READ(kind, s, i) == ch)
377 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
378 }
379 else {
380 for(i = size-1; i >= 0; i--)
381 if (PyUnicode_READ(kind, s, i) == ch)
382 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
383 }
384 return NULL;
385}
386
Victor Stinnerfe226c02011-10-03 03:52:20 +0200387static PyObject*
388resize_compact(PyObject *unicode, Py_ssize_t length)
389{
390 Py_ssize_t char_size;
391 Py_ssize_t struct_size;
392 Py_ssize_t new_size;
393 int share_wstr;
394
395 assert(PyUnicode_IS_READY(unicode));
396 char_size = PyUnicode_CHARACTER_SIZE(unicode);
397 if (PyUnicode_IS_COMPACT_ASCII(unicode))
398 struct_size = sizeof(PyASCIIObject);
399 else
400 struct_size = sizeof(PyCompactUnicodeObject);
401 share_wstr = (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(unicode));
402
403 _Py_DEC_REFTOTAL;
404 _Py_ForgetReference(unicode);
405
406 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
407 PyErr_NoMemory();
408 return NULL;
409 }
410 new_size = (struct_size + (length + 1) * char_size);
411
412 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
413 if (unicode == NULL) {
414 PyObject_Del(unicode);
415 PyErr_NoMemory();
416 return NULL;
417 }
418 _Py_NewReference(unicode);
419 _PyUnicode_LENGTH(unicode) = length;
420 if (share_wstr)
421 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
422 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
423 length, 0);
424 return unicode;
425}
426
Alexander Belopolsky40018472011-02-26 01:02:56 +0000427static int
Victor Stinnerfe226c02011-10-03 03:52:20 +0200428resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000429{
430 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200432 assert(!PyUnicode_IS_COMPACT(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200433
Victor Stinnerfe226c02011-10-03 03:52:20 +0200434 assert(Py_REFCNT(unicode) == 1);
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200435 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000436
Victor Stinnerfe226c02011-10-03 03:52:20 +0200437 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
438 {
439 PyObject_DEL(_PyUnicode_UTF8(unicode));
440 _PyUnicode_UTF8(unicode) = NULL;
441 }
442
443 if (PyUnicode_IS_READY(unicode)) {
444 Py_ssize_t char_size;
445 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200446 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200447 void *data;
448
449 data = _PyUnicode_DATA_ANY(unicode);
450 assert(data != NULL);
451 char_size = PyUnicode_CHARACTER_SIZE(unicode);
452 share_wstr = (_PyUnicode_WSTR(unicode) == data);
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200453 share_utf8 = (_PyUnicode_UTF8(unicode) == data);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200454
455 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
456 PyErr_NoMemory();
457 return -1;
458 }
459 new_size = (length + 1) * char_size;
460
461 data = (PyObject *)PyObject_REALLOC(data, new_size);
462 if (data == NULL) {
463 PyErr_NoMemory();
464 return -1;
465 }
466 _PyUnicode_DATA_ANY(unicode) = data;
467 if (share_wstr)
468 _PyUnicode_WSTR(unicode) = data;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200469 if (share_utf8)
470 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200471 _PyUnicode_LENGTH(unicode) = length;
472 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
473 if (share_wstr)
474 return 0;
475 }
476 if (_PyUnicode_WSTR(unicode) != NULL) {
477 assert(_PyUnicode_WSTR(unicode) != NULL);
478
479 oldstr = _PyUnicode_WSTR(unicode);
480 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
481 sizeof(Py_UNICODE) * (length + 1));
482 if (!_PyUnicode_WSTR(unicode)) {
483 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
484 PyErr_NoMemory();
485 return -1;
486 }
487 _PyUnicode_WSTR(unicode)[length] = 0;
488 _PyUnicode_WSTR_LENGTH(unicode) = length;
489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000490 return 0;
491}
492
Victor Stinnerfe226c02011-10-03 03:52:20 +0200493static PyObject*
494resize_copy(PyObject *unicode, Py_ssize_t length)
495{
496 Py_ssize_t copy_length;
497 if (PyUnicode_IS_COMPACT(unicode)) {
498 PyObject *copy;
499 assert(PyUnicode_IS_READY(unicode));
500
501 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
502 if (copy == NULL)
503 return NULL;
504
505 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
506 if (PyUnicode_CopyCharacters(copy, 0,
507 unicode, 0,
508 copy_length) < 0)
509 {
510 Py_DECREF(copy);
511 return NULL;
512 }
513 return copy;
514 } else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200515 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200516 assert(_PyUnicode_WSTR(unicode) != NULL);
517 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200518 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200519 if (w == NULL)
520 return NULL;
521 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
522 copy_length = Py_MIN(copy_length, length);
523 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
524 copy_length);
525 return (PyObject*)w;
526 }
527}
528
Guido van Rossumd57fd912000-03-10 22:53:23 +0000529/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000530 Ux0000 terminated; some code (e.g. new_identifier)
531 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000532
533 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000534 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000535
536*/
537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200538#ifdef Py_DEBUG
539int unicode_old_new_calls = 0;
540#endif
541
Alexander Belopolsky40018472011-02-26 01:02:56 +0000542static PyUnicodeObject *
543_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000544{
545 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200546 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000547
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000549 if (length == 0 && unicode_empty != NULL) {
550 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200551 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000552 }
553
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000554 /* Ensure we won't overflow the size. */
555 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
556 return (PyUnicodeObject *)PyErr_NoMemory();
557 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200558 if (length < 0) {
559 PyErr_SetString(PyExc_SystemError,
560 "Negative size passed to _PyUnicode_New");
561 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000562 }
563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200564#ifdef Py_DEBUG
565 ++unicode_old_new_calls;
566#endif
567
568 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
569 if (unicode == NULL)
570 return NULL;
571 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
572 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
573 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000574 PyErr_NoMemory();
575 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000576 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200577
Jeremy Hyltond8082792003-09-16 19:41:39 +0000578 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000579 * the caller fails before initializing str -- unicode_resize()
580 * reads str[0], and the Keep-Alive optimization can keep memory
581 * allocated for str alive across a call to unicode_dealloc(unicode).
582 * We don't want unicode_resize to read uninitialized memory in
583 * that case.
584 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200585 _PyUnicode_WSTR(unicode)[0] = 0;
586 _PyUnicode_WSTR(unicode)[length] = 0;
587 _PyUnicode_WSTR_LENGTH(unicode) = length;
588 _PyUnicode_HASH(unicode) = -1;
589 _PyUnicode_STATE(unicode).interned = 0;
590 _PyUnicode_STATE(unicode).kind = 0;
591 _PyUnicode_STATE(unicode).compact = 0;
592 _PyUnicode_STATE(unicode).ready = 0;
593 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200594 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200595 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200596 _PyUnicode_UTF8(unicode) = NULL;
597 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000598 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000599
Benjamin Peterson29060642009-01-31 22:14:21 +0000600 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000601 /* XXX UNREF/NEWREF interface should be more symmetrical */
602 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000603 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000604 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000605 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000606}
607
Victor Stinnerf42dc442011-10-02 23:33:16 +0200608static const char*
609unicode_kind_name(PyObject *unicode)
610{
Victor Stinner910337b2011-10-03 03:20:16 +0200611 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerf42dc442011-10-02 23:33:16 +0200612 if (!PyUnicode_IS_COMPACT(unicode))
613 {
614 if (!PyUnicode_IS_READY(unicode))
615 return "wstr";
616 switch(PyUnicode_KIND(unicode))
617 {
618 case PyUnicode_1BYTE_KIND:
619 if (PyUnicode_IS_COMPACT_ASCII(unicode))
620 return "legacy ascii";
621 else
622 return "legacy latin1";
623 case PyUnicode_2BYTE_KIND:
624 return "legacy UCS2";
625 case PyUnicode_4BYTE_KIND:
626 return "legacy UCS4";
627 default:
628 return "<legacy invalid kind>";
629 }
630 }
631 assert(PyUnicode_IS_READY(unicode));
632 switch(PyUnicode_KIND(unicode))
633 {
634 case PyUnicode_1BYTE_KIND:
635 if (PyUnicode_IS_COMPACT_ASCII(unicode))
636 return "ascii";
637 else
638 return "compact latin1";
639 case PyUnicode_2BYTE_KIND:
640 return "compact UCS2";
641 case PyUnicode_4BYTE_KIND:
642 return "compact UCS4";
643 default:
644 return "<invalid compact kind>";
645 }
646}
647
#ifdef Py_DEBUG
/* Number of objects created through PyUnicode_New() (debug builds). */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the candidate data locations of a Unicode object to stdout and
   return the result of PyUnicode_DATA(). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line summary of a Unicode object's header to stdout: the
   representation name, length and wstr pointer; for non-ASCII objects also
   the wstr length and the utf8 pointer and length; for non-compact objects
   also the data pointer.

   The length fields are signed (Py_ssize_t), so they are printed with the
   PY_FORMAT_SIZE_T length modifier and "d" rather than "%zu", and pointers
   are cast to void * as "%p" requires. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    printf("%s: len=%" PY_FORMAT_SIZE_T "d, wstr=%p",
           unicode_kind_name(op),
           ascii->length,
           (void *)ascii->wstr);
    if (!ascii->state.ascii) {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        printf(" (%" PY_FORMAT_SIZE_T "d), utf8=%p (%" PY_FORMAT_SIZE_T "d)",
               compact->wstr_length,
               (void *)compact->utf8,
               compact->utf8_length);
    }
    if (!ascii->state.compact) {
        PyUnicodeObject *unicode = (PyUnicodeObject *)op;
        printf(", data=%p",
               (void *)unicode->data.any);
    }
    printf("\n");
}
#endif
692
693PyObject *
694PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
695{
696 PyObject *obj;
697 PyCompactUnicodeObject *unicode;
698 void *data;
699 int kind_state;
700 int is_sharing = 0, is_ascii = 0;
701 Py_ssize_t char_size;
702 Py_ssize_t struct_size;
703
704 /* Optimization for empty strings */
705 if (size == 0 && unicode_empty != NULL) {
706 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200707 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200708 }
709
710#ifdef Py_DEBUG
711 ++unicode_new_new_calls;
712#endif
713
714 struct_size = sizeof(PyCompactUnicodeObject);
715 if (maxchar < 128) {
716 kind_state = PyUnicode_1BYTE_KIND;
717 char_size = 1;
718 is_ascii = 1;
719 struct_size = sizeof(PyASCIIObject);
720 }
721 else if (maxchar < 256) {
722 kind_state = PyUnicode_1BYTE_KIND;
723 char_size = 1;
724 }
725 else if (maxchar < 65536) {
726 kind_state = PyUnicode_2BYTE_KIND;
727 char_size = 2;
728 if (sizeof(wchar_t) == 2)
729 is_sharing = 1;
730 }
731 else {
732 kind_state = PyUnicode_4BYTE_KIND;
733 char_size = 4;
734 if (sizeof(wchar_t) == 4)
735 is_sharing = 1;
736 }
737
738 /* Ensure we won't overflow the size. */
739 if (size < 0) {
740 PyErr_SetString(PyExc_SystemError,
741 "Negative size passed to PyUnicode_New");
742 return NULL;
743 }
744 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
745 return PyErr_NoMemory();
746
747 /* Duplicated allocation code from _PyObject_New() instead of a call to
748 * PyObject_New() so we are able to allocate space for the object and
749 * it's data buffer.
750 */
751 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
752 if (obj == NULL)
753 return PyErr_NoMemory();
754 obj = PyObject_INIT(obj, &PyUnicode_Type);
755 if (obj == NULL)
756 return NULL;
757
758 unicode = (PyCompactUnicodeObject *)obj;
759 if (is_ascii)
760 data = ((PyASCIIObject*)obj) + 1;
761 else
762 data = unicode + 1;
763 _PyUnicode_LENGTH(unicode) = size;
764 _PyUnicode_HASH(unicode) = -1;
765 _PyUnicode_STATE(unicode).interned = 0;
766 _PyUnicode_STATE(unicode).kind = kind_state;
767 _PyUnicode_STATE(unicode).compact = 1;
768 _PyUnicode_STATE(unicode).ready = 1;
769 _PyUnicode_STATE(unicode).ascii = is_ascii;
770 if (is_ascii) {
771 ((char*)data)[size] = 0;
772 _PyUnicode_WSTR(unicode) = NULL;
773 }
774 else if (kind_state == PyUnicode_1BYTE_KIND) {
775 ((char*)data)[size] = 0;
776 _PyUnicode_WSTR(unicode) = NULL;
777 _PyUnicode_WSTR_LENGTH(unicode) = 0;
778 unicode->utf8_length = 0;
779 unicode->utf8 = NULL;
780 }
781 else {
782 unicode->utf8 = NULL;
783 if (kind_state == PyUnicode_2BYTE_KIND)
784 ((Py_UCS2*)data)[size] = 0;
785 else /* kind_state == PyUnicode_4BYTE_KIND */
786 ((Py_UCS4*)data)[size] = 0;
787 if (is_sharing) {
788 _PyUnicode_WSTR_LENGTH(unicode) = size;
789 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
790 }
791 else {
792 _PyUnicode_WSTR_LENGTH(unicode) = 0;
793 _PyUnicode_WSTR(unicode) = NULL;
794 }
795 }
796 return obj;
797}
798
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store the result
   in the 4-byte data buffer of `unicode`.

   A high surrogate (0xD800-0xDBFF) immediately followed by a low surrogate
   (0xDC00-0xDFFF) is combined into a single code point; every other unit,
   including a lone surrogate, is copied unchanged.  The other conversions
   are implemented as macros for efficiency.

   `unicode` must already be a 4-byte kind string whose length matches the
   number of code points produced (this is only checked by assertions). */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (*src >= 0xD800 && *src <= 0xDBFF
                   && (src+1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            /* Plain BMP unit or lone surrogate: copy as is. */
            *dst++ = *src;
            src++;
            continue;
        }
        /* Combine the 10 payload bits of each half of the pair. */
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
837
Victor Stinnercd9950f2011-10-02 00:34:53 +0200838static int
839_PyUnicode_Dirty(PyObject *unicode)
840{
Victor Stinner910337b2011-10-03 03:20:16 +0200841 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200842 if (Py_REFCNT(unicode) != 1) {
843 PyErr_SetString(PyExc_ValueError,
844 "Cannot modify a string having more than 1 reference");
845 return -1;
846 }
847 _PyUnicode_DIRTY(unicode);
848 return 0;
849}
850
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200851Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200852PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
853 PyObject *from, Py_ssize_t from_start,
854 Py_ssize_t how_many)
855{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200856 unsigned int from_kind, to_kind;
857 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858
Victor Stinnerb1536152011-09-30 02:26:10 +0200859 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
860 PyErr_BadInternalCall();
861 return -1;
862 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200863
864 if (PyUnicode_READY(from))
865 return -1;
866 if (PyUnicode_READY(to))
867 return -1;
868
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200869 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200870 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
871 PyErr_Format(PyExc_ValueError,
872 "Cannot write %zi characters at %zi "
873 "in a string of %zi characters",
874 how_many, to_start, PyUnicode_GET_LENGTH(to));
875 return -1;
876 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200877 if (how_many == 0)
878 return 0;
879
Victor Stinnercd9950f2011-10-02 00:34:53 +0200880 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200881 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200882
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200883 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200884 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200885 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200886 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200887
Victor Stinnerf42dc442011-10-02 23:33:16 +0200888 if (from_kind == to_kind
889 /* deny latin1 => ascii */
890 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
891 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200892 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200893 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200894 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200895 + PyUnicode_KIND_SIZE(from_kind, from_start),
896 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200897 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200898 else if (from_kind == PyUnicode_1BYTE_KIND
899 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200900 {
901 _PyUnicode_CONVERT_BYTES(
902 Py_UCS1, Py_UCS2,
903 PyUnicode_1BYTE_DATA(from) + from_start,
904 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
905 PyUnicode_2BYTE_DATA(to) + to_start
906 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200907 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200908 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200909 && to_kind == PyUnicode_4BYTE_KIND)
910 {
911 _PyUnicode_CONVERT_BYTES(
912 Py_UCS1, Py_UCS4,
913 PyUnicode_1BYTE_DATA(from) + from_start,
914 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
915 PyUnicode_4BYTE_DATA(to) + to_start
916 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200917 }
918 else if (from_kind == PyUnicode_2BYTE_KIND
919 && to_kind == PyUnicode_4BYTE_KIND)
920 {
921 _PyUnicode_CONVERT_BYTES(
922 Py_UCS2, Py_UCS4,
923 PyUnicode_2BYTE_DATA(from) + from_start,
924 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
925 PyUnicode_4BYTE_DATA(to) + to_start
926 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200927 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200928 else {
929 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +0200930
931 /* check if max_char(from substring) <= max_char(to) */
932 if (from_kind > to_kind
933 /* latin1 => ascii */
934 || (PyUnicode_IS_COMPACT_ASCII(to)
935 && to_kind == PyUnicode_1BYTE_KIND
936 && !PyUnicode_IS_COMPACT_ASCII(from)))
937 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200938 /* slow path to check for character overflow */
939 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
940 Py_UCS4 ch, maxchar;
941 Py_ssize_t i;
942
943 maxchar = 0;
944 invalid_kinds = 0;
945 for (i=0; i < how_many; i++) {
946 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
947 if (ch > maxchar) {
948 maxchar = ch;
949 if (maxchar > to_maxchar) {
950 invalid_kinds = 1;
951 break;
952 }
953 }
954 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
955 }
956 }
957 else
958 invalid_kinds = 1;
959 if (invalid_kinds) {
960 PyErr_Format(PyExc_ValueError,
Victor Stinnerf42dc442011-10-02 23:33:16 +0200961 "Cannot copy %s characters "
962 "into a string of %s characters",
963 unicode_kind_name(from),
964 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +0200965 return -1;
966 }
967 }
968 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969}
970
Victor Stinner17222162011-09-28 22:15:37 +0200971/* Find the maximum code point and count the number of surrogate pairs so a
972 correct string length can be computed before converting a string to UCS4.
973 This function counts single surrogates as a character and not as a pair.
974
975 Return 0 on success, or -1 on error. */
976static int
977find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
978 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979{
980 const wchar_t *iter;
981
Victor Stinnerc53be962011-10-02 21:33:54 +0200982 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 if (num_surrogates == NULL || maxchar == NULL) {
984 PyErr_SetString(PyExc_SystemError,
985 "unexpected NULL arguments to "
986 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
987 return -1;
988 }
989
990 *num_surrogates = 0;
991 *maxchar = 0;
992
993 for (iter = begin; iter < end; ) {
994 if (*iter > *maxchar)
995 *maxchar = *iter;
996#if SIZEOF_WCHAR_T == 2
997 if (*iter >= 0xD800 && *iter <= 0xDBFF
998 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
999 {
1000 Py_UCS4 surrogate_val;
1001 surrogate_val = (((iter[0] & 0x3FF)<<10)
1002 | (iter[1] & 0x3FF)) + 0x10000;
1003 ++(*num_surrogates);
1004 if (surrogate_val > *maxchar)
1005 *maxchar = surrogate_val;
1006 iter += 2;
1007 }
1008 else
1009 iter++;
1010#else
1011 iter++;
1012#endif
1013 }
1014 return 0;
1015}
1016
1017#ifdef Py_DEBUG
1018int unicode_ready_calls = 0;
1019#endif
1020
1021int
Victor Stinnerd8f65102011-09-29 19:43:17 +02001022_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001023{
Victor Stinnerd8f65102011-09-29 19:43:17 +02001024 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001025 wchar_t *end;
1026 Py_UCS4 maxchar = 0;
1027 Py_ssize_t num_surrogates;
1028#if SIZEOF_WCHAR_T == 2
1029 Py_ssize_t length_wo_surrogates;
1030#endif
1031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001032 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001033 strings were created using _PyObject_New() and where no canonical
1034 representation (the str field) has been set yet aka strings
1035 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001036 assert(_PyUnicode_CHECK(unicode));
1037 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001038 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001039 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001040 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001041 /* Actually, it should neither be interned nor be anything else: */
1042 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001043
1044#ifdef Py_DEBUG
1045 ++unicode_ready_calls;
1046#endif
1047
1048 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001049 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001050 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052
1053 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001054 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1055 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 PyErr_NoMemory();
1057 return -1;
1058 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001059 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 _PyUnicode_WSTR(unicode), end,
1061 PyUnicode_1BYTE_DATA(unicode));
1062 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1063 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1064 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1065 if (maxchar < 128) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001066 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001067 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 }
1069 else {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001070 _PyUnicode_UTF8(unicode) = NULL;
1071 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072 }
1073 PyObject_FREE(_PyUnicode_WSTR(unicode));
1074 _PyUnicode_WSTR(unicode) = NULL;
1075 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1076 }
1077 /* In this case we might have to convert down from 4-byte native
1078 wchar_t to 2-byte unicode. */
1079 else if (maxchar < 65536) {
1080 assert(num_surrogates == 0 &&
1081 "FindMaxCharAndNumSurrogatePairs() messed up");
1082
Victor Stinner506f5922011-09-28 22:34:18 +02001083#if SIZEOF_WCHAR_T == 2
1084 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001085 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001086 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1087 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1088 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001089 _PyUnicode_UTF8(unicode) = NULL;
1090 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001091#else
1092 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001093 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001094 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001095 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001096 PyErr_NoMemory();
1097 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001098 }
Victor Stinner506f5922011-09-28 22:34:18 +02001099 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1100 _PyUnicode_WSTR(unicode), end,
1101 PyUnicode_2BYTE_DATA(unicode));
1102 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1103 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1104 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001105 _PyUnicode_UTF8(unicode) = NULL;
1106 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001107 PyObject_FREE(_PyUnicode_WSTR(unicode));
1108 _PyUnicode_WSTR(unicode) = NULL;
1109 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1110#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111 }
1112 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1113 else {
1114#if SIZEOF_WCHAR_T == 2
1115 /* in case the native representation is 2-bytes, we need to allocate a
1116 new normalized 4-byte version. */
1117 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001118 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1119 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 PyErr_NoMemory();
1121 return -1;
1122 }
1123 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1124 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001125 _PyUnicode_UTF8(unicode) = NULL;
1126 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001127 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1128 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001129 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001130 PyObject_FREE(_PyUnicode_WSTR(unicode));
1131 _PyUnicode_WSTR(unicode) = NULL;
1132 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1133#else
1134 assert(num_surrogates == 0);
1135
Victor Stinnerc3c74152011-10-02 20:39:55 +02001136 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001137 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001138 _PyUnicode_UTF8(unicode) = NULL;
1139 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001140 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1141#endif
1142 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1143 }
1144 _PyUnicode_STATE(unicode).ready = 1;
1145 return 0;
1146}
1147
Alexander Belopolsky40018472011-02-26 01:02:56 +00001148static void
1149unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150{
Walter Dörwald16807132007-05-25 13:52:07 +00001151 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001152 case SSTATE_NOT_INTERNED:
1153 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001154
Benjamin Peterson29060642009-01-31 22:14:21 +00001155 case SSTATE_INTERNED_MORTAL:
1156 /* revive dead object temporarily for DelItem */
1157 Py_REFCNT(unicode) = 3;
1158 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1159 Py_FatalError(
1160 "deletion of interned string failed");
1161 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001162
Benjamin Peterson29060642009-01-31 22:14:21 +00001163 case SSTATE_INTERNED_IMMORTAL:
1164 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001165
Benjamin Peterson29060642009-01-31 22:14:21 +00001166 default:
1167 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001168 }
1169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001170 if (_PyUnicode_WSTR(unicode) &&
1171 (!PyUnicode_IS_READY(unicode) ||
1172 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
1173 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001174 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001175 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001176
1177 if (PyUnicode_IS_COMPACT(unicode)) {
1178 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 }
1180 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001181 if (_PyUnicode_DATA_ANY(unicode))
1182 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001183 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184 }
1185}
1186
Alexander Belopolsky40018472011-02-26 01:02:56 +00001187static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001188unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001189{
Victor Stinnera3be6132011-10-03 02:16:37 +02001190 Py_ssize_t len;
Victor Stinnerca4f7a42011-10-03 04:18:04 +02001191#if SIZEOF_WCHAR_T == 2
1192 /* FIXME: unicode_resize() is buggy on Windows */
1193 return 0;
1194#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001195 if (Py_REFCNT(unicode) != 1)
1196 return 0;
1197 if (PyUnicode_CHECK_INTERNED(unicode))
1198 return 0;
1199 if (unicode == unicode_empty)
1200 return 0;
Victor Stinnera3be6132011-10-03 02:16:37 +02001201 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1202 len = PyUnicode_WSTR_LENGTH(unicode);
1203 else
1204 len = PyUnicode_GET_LENGTH(unicode);
1205 if (len == 1) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001206 Py_UCS4 ch;
Victor Stinnera3be6132011-10-03 02:16:37 +02001207 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001208 ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnera3be6132011-10-03 02:16:37 +02001209 else
1210 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001211 if (ch < 256 && unicode_latin1[ch] == unicode)
1212 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001213 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001214 /* FIXME: reenable resize_inplace */
1215 if (!PyUnicode_IS_COMPACT(unicode))
1216 return 0;
1217 return 1;
1218}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001219
Victor Stinnerfe226c02011-10-03 03:52:20 +02001220static int
1221unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1222{
1223 PyObject *unicode;
1224 Py_ssize_t old_length;
1225
1226 assert(p_unicode != NULL);
1227 unicode = *p_unicode;
1228
1229 assert(unicode != NULL);
1230 assert(PyUnicode_Check(unicode));
1231 assert(0 <= length);
1232
Victor Stinner910337b2011-10-03 03:20:16 +02001233 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001234 old_length = PyUnicode_WSTR_LENGTH(unicode);
1235 else
1236 old_length = PyUnicode_GET_LENGTH(unicode);
1237 if (old_length == length)
1238 return 0;
1239
1240 /* FIXME: really create a new object? */
1241 if (!unicode_resizable(unicode)) {
1242 PyObject *copy = resize_copy(unicode, length);
1243 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001244 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001245 Py_DECREF(*p_unicode);
1246 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001247 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001248 }
1249
Victor Stinnerfe226c02011-10-03 03:52:20 +02001250 if (PyUnicode_IS_COMPACT(unicode)) {
1251 *p_unicode = resize_compact(unicode, length);
1252 if (*p_unicode == NULL)
1253 return -1;
1254 return 0;
1255 } else
1256 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001257}
1258
Alexander Belopolsky40018472011-02-26 01:02:56 +00001259int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001260PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001261{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001262 PyObject *unicode;
1263 if (p_unicode == NULL) {
1264 PyErr_BadInternalCall();
1265 return -1;
1266 }
1267 unicode = *p_unicode;
1268 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1269 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1270 {
1271 PyErr_BadInternalCall();
1272 return -1;
1273 }
1274 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001275}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001277static PyObject*
1278get_latin1_char(unsigned char ch)
1279{
Victor Stinnera464fc12011-10-02 20:39:30 +02001280 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001281 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001282 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001283 if (!unicode)
1284 return NULL;
1285 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1286 unicode_latin1[ch] = unicode;
1287 }
1288 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001289 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001290}
1291
Alexander Belopolsky40018472011-02-26 01:02:56 +00001292PyObject *
1293PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294{
1295 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296 Py_UCS4 maxchar = 0;
1297 Py_ssize_t num_surrogates;
1298
1299 if (u == NULL)
1300 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001301
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001302 /* If the Unicode data is known at construction time, we can apply
1303 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 /* Optimization for empty strings */
1306 if (size == 0 && unicode_empty != NULL) {
1307 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001308 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001309 }
Tim Petersced69f82003-09-16 20:30:58 +00001310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 /* Single character Unicode objects in the Latin-1 range are
1312 shared when using this constructor */
1313 if (size == 1 && *u < 256)
1314 return get_latin1_char((unsigned char)*u);
1315
1316 /* If not empty and not single character, copy the Unicode data
1317 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001318 if (find_maxchar_surrogates(u, u + size,
1319 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 return NULL;
1321
1322 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1323 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001324 if (!unicode)
1325 return NULL;
1326
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001327 switch (PyUnicode_KIND(unicode)) {
1328 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001329 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1331 break;
1332 case PyUnicode_2BYTE_KIND:
1333#if Py_UNICODE_SIZE == 2
1334 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1335#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001336 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1338#endif
1339 break;
1340 case PyUnicode_4BYTE_KIND:
1341#if SIZEOF_WCHAR_T == 2
1342 /* This is the only case which has to process surrogates, thus
1343 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001344 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345#else
1346 assert(num_surrogates == 0);
1347 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1348#endif
1349 break;
1350 default:
1351 assert(0 && "Impossible state");
1352 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001353
1354 return (PyObject *)unicode;
1355}
1356
Alexander Belopolsky40018472011-02-26 01:02:56 +00001357PyObject *
1358PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001359{
1360 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001361
Benjamin Peterson14339b62009-01-31 16:36:08 +00001362 if (size < 0) {
1363 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001364 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001365 return NULL;
1366 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001367
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001368 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001369 some optimizations which share commonly used objects.
1370 Also, this means the input must be UTF-8, so fall back to the
1371 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001372 if (u != NULL) {
1373
Benjamin Peterson29060642009-01-31 22:14:21 +00001374 /* Optimization for empty strings */
1375 if (size == 0 && unicode_empty != NULL) {
1376 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001377 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001378 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001379
1380 /* Single characters are shared when using this constructor.
1381 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382 if (size == 1 && Py_CHARMASK(*u) < 128)
1383 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001384
1385 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001386 }
1387
Walter Dörwald55507312007-05-18 13:12:10 +00001388 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001389 if (!unicode)
1390 return NULL;
1391
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001392 return (PyObject *)unicode;
1393}
1394
Alexander Belopolsky40018472011-02-26 01:02:56 +00001395PyObject *
1396PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001397{
1398 size_t size = strlen(u);
1399 if (size > PY_SSIZE_T_MAX) {
1400 PyErr_SetString(PyExc_OverflowError, "input too long");
1401 return NULL;
1402 }
1403
1404 return PyUnicode_FromStringAndSize(u, size);
1405}
1406
Victor Stinnere57b1c02011-09-28 22:20:48 +02001407static PyObject*
1408_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001409{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 PyObject *res;
1411 unsigned char max = 127;
1412 Py_ssize_t i;
1413 for (i = 0; i < size; i++) {
1414 if (u[i] & 0x80) {
1415 max = 255;
1416 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001417 }
1418 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419 res = PyUnicode_New(size, max);
1420 if (!res)
1421 return NULL;
1422 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1423 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001424}
1425
Victor Stinnere57b1c02011-09-28 22:20:48 +02001426static PyObject*
1427_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001428{
1429 PyObject *res;
1430 Py_UCS2 max = 0;
1431 Py_ssize_t i;
1432 for (i = 0; i < size; i++)
1433 if (u[i] > max)
1434 max = u[i];
1435 res = PyUnicode_New(size, max);
1436 if (!res)
1437 return NULL;
1438 if (max >= 256)
1439 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1440 else
1441 for (i = 0; i < size; i++)
1442 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1443 return res;
1444}
1445
Victor Stinnere57b1c02011-09-28 22:20:48 +02001446static PyObject*
1447_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448{
1449 PyObject *res;
1450 Py_UCS4 max = 0;
1451 Py_ssize_t i;
1452 for (i = 0; i < size; i++)
1453 if (u[i] > max)
1454 max = u[i];
1455 res = PyUnicode_New(size, max);
1456 if (!res)
1457 return NULL;
1458 if (max >= 0x10000)
1459 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1460 else {
1461 int kind = PyUnicode_KIND(res);
1462 void *data = PyUnicode_DATA(res);
1463 for (i = 0; i < size; i++)
1464 PyUnicode_WRITE(kind, data, i, u[i]);
1465 }
1466 return res;
1467}
1468
1469PyObject*
1470PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1471{
1472 switch(kind) {
1473 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001474 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001475 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001476 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001477 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001478 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001479 }
Victor Stinner202b62b2011-10-01 23:48:37 +02001480 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 return NULL;
1482}
1483
Victor Stinner034f6cf2011-09-30 02:26:44 +02001484PyObject*
1485PyUnicode_Copy(PyObject *unicode)
1486{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001487 Py_ssize_t size;
1488 PyObject *copy;
1489 void *data;
1490
Victor Stinner034f6cf2011-09-30 02:26:44 +02001491 if (!PyUnicode_Check(unicode)) {
1492 PyErr_BadInternalCall();
1493 return NULL;
1494 }
1495 if (PyUnicode_READY(unicode))
1496 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001497
1498 size = PyUnicode_GET_LENGTH(unicode);
1499 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1500 if (!copy)
1501 return NULL;
1502 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1503
1504 data = PyUnicode_DATA(unicode);
1505 switch (PyUnicode_KIND(unicode))
1506 {
1507 case PyUnicode_1BYTE_KIND:
1508 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1509 break;
1510 case PyUnicode_2BYTE_KIND:
1511 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1512 break;
1513 case PyUnicode_4BYTE_KIND:
1514 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1515 break;
1516 default:
1517 assert(0);
1518 break;
1519 }
1520 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001521}
1522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001523
Victor Stinnerbc603d12011-10-02 01:00:40 +02001524/* Widen Unicode objects to larger buffers. Don't write terminating null
1525 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001526
1527void*
1528_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1529{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001530 Py_ssize_t len;
1531 void *result;
1532 unsigned int skind;
1533
1534 if (PyUnicode_READY(s))
1535 return NULL;
1536
1537 len = PyUnicode_GET_LENGTH(s);
1538 skind = PyUnicode_KIND(s);
1539 if (skind >= kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001540 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1541 return NULL;
1542 }
1543 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001544 case PyUnicode_2BYTE_KIND:
1545 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1546 if (!result)
1547 return PyErr_NoMemory();
1548 assert(skind == PyUnicode_1BYTE_KIND);
1549 _PyUnicode_CONVERT_BYTES(
1550 Py_UCS1, Py_UCS2,
1551 PyUnicode_1BYTE_DATA(s),
1552 PyUnicode_1BYTE_DATA(s) + len,
1553 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001554 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001555 case PyUnicode_4BYTE_KIND:
1556 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1557 if (!result)
1558 return PyErr_NoMemory();
1559 if (skind == PyUnicode_2BYTE_KIND) {
1560 _PyUnicode_CONVERT_BYTES(
1561 Py_UCS2, Py_UCS4,
1562 PyUnicode_2BYTE_DATA(s),
1563 PyUnicode_2BYTE_DATA(s) + len,
1564 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001566 else {
1567 assert(skind == PyUnicode_1BYTE_KIND);
1568 _PyUnicode_CONVERT_BYTES(
1569 Py_UCS1, Py_UCS4,
1570 PyUnicode_1BYTE_DATA(s),
1571 PyUnicode_1BYTE_DATA(s) + len,
1572 result);
1573 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001574 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001575 default:
1576 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001577 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001578 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001579 return NULL;
1580}
1581
1582static Py_UCS4*
1583as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1584 int copy_null)
1585{
1586 int kind;
1587 void *data;
1588 Py_ssize_t len, targetlen;
1589 if (PyUnicode_READY(string) == -1)
1590 return NULL;
1591 kind = PyUnicode_KIND(string);
1592 data = PyUnicode_DATA(string);
1593 len = PyUnicode_GET_LENGTH(string);
1594 targetlen = len;
1595 if (copy_null)
1596 targetlen++;
1597 if (!target) {
1598 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1599 PyErr_NoMemory();
1600 return NULL;
1601 }
1602 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1603 if (!target) {
1604 PyErr_NoMemory();
1605 return NULL;
1606 }
1607 }
1608 else {
1609 if (targetsize < targetlen) {
1610 PyErr_Format(PyExc_SystemError,
1611 "string is longer than the buffer");
1612 if (copy_null && 0 < targetsize)
1613 target[0] = 0;
1614 return NULL;
1615 }
1616 }
1617 if (kind != PyUnicode_4BYTE_KIND) {
1618 Py_ssize_t i;
1619 for (i = 0; i < len; i++)
1620 target[i] = PyUnicode_READ(kind, data, i);
1621 }
1622 else
1623 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1624 if (copy_null)
1625 target[len] = 0;
1626 return target;
1627}
1628
1629Py_UCS4*
1630PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1631 int copy_null)
1632{
1633 if (target == NULL || targetsize < 1) {
1634 PyErr_BadInternalCall();
1635 return NULL;
1636 }
1637 return as_ucs4(string, target, targetsize, copy_null);
1638}
1639
1640Py_UCS4*
1641PyUnicode_AsUCS4Copy(PyObject *string)
1642{
1643 return as_ucs4(string, NULL, 0, 1);
1644}
1645
1646#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001647
Alexander Belopolsky40018472011-02-26 01:02:56 +00001648PyObject *
1649PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001650{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001651 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001652 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001653 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001654 PyErr_BadInternalCall();
1655 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001656 }
1657
Martin v. Löwis790465f2008-04-05 20:41:37 +00001658 if (size == -1) {
1659 size = wcslen(w);
1660 }
1661
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001662 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001663}
1664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001665#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001666
Walter Dörwald346737f2007-05-31 10:44:43 +00001667static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001668makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1669 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001670{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001671 *fmt++ = '%';
1672 if (width) {
1673 if (zeropad)
1674 *fmt++ = '0';
1675 fmt += sprintf(fmt, "%d", width);
1676 }
1677 if (precision)
1678 fmt += sprintf(fmt, ".%d", precision);
1679 if (longflag)
1680 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001681 else if (longlongflag) {
1682 /* longlongflag should only ever be nonzero on machines with
1683 HAVE_LONG_LONG defined */
1684#ifdef HAVE_LONG_LONG
1685 char *f = PY_FORMAT_LONG_LONG;
1686 while (*f)
1687 *fmt++ = *f++;
1688#else
1689 /* we shouldn't ever get here */
1690 assert(0);
1691 *fmt++ = 'l';
1692#endif
1693 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001694 else if (size_tflag) {
1695 char *f = PY_FORMAT_SIZE_T;
1696 while (*f)
1697 *fmt++ = *f++;
1698 }
1699 *fmt++ = c;
1700 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001701}
1702
Victor Stinner96865452011-03-01 23:44:09 +00001703/* helper for PyUnicode_FromFormatV() */
1704
1705static const char*
1706parse_format_flags(const char *f,
1707 int *p_width, int *p_precision,
1708 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
1709{
1710 int width, precision, longflag, longlongflag, size_tflag;
1711
1712 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
1713 f++;
1714 width = 0;
1715 while (Py_ISDIGIT((unsigned)*f))
1716 width = (width*10) + *f++ - '0';
1717 precision = 0;
1718 if (*f == '.') {
1719 f++;
1720 while (Py_ISDIGIT((unsigned)*f))
1721 precision = (precision*10) + *f++ - '0';
1722 if (*f == '%') {
1723 /* "%.3%s" => f points to "3" */
1724 f--;
1725 }
1726 }
1727 if (*f == '\0') {
1728 /* bogus format "%.1" => go backward, f points to "1" */
1729 f--;
1730 }
1731 if (p_width != NULL)
1732 *p_width = width;
1733 if (p_precision != NULL)
1734 *p_precision = precision;
1735
1736 /* Handle %ld, %lu, %lld and %llu. */
1737 longflag = 0;
1738 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00001739 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00001740
1741 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00001742 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00001743 longflag = 1;
1744 ++f;
1745 }
1746#ifdef HAVE_LONG_LONG
1747 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00001748 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001749 longlongflag = 1;
1750 f += 2;
1751 }
1752#endif
1753 }
1754 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00001755 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001756 size_tflag = 1;
1757 ++f;
1758 }
1759 if (p_longflag != NULL)
1760 *p_longflag = longflag;
1761 if (p_longlongflag != NULL)
1762 *p_longlongflag = longlongflag;
1763 if (p_size_tflag != NULL)
1764 *p_size_tflag = size_tflag;
1765 return f;
1766}
1767
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001768/* maximum number of characters required for output of %ld. 21 characters
1769 allows for 64-bit integers (in decimal) and an optional sign. */
1770#define MAX_LONG_CHARS 21
1771/* maximum number of characters required for output of %lld.
1772 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
1773 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
1774#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1775
Walter Dörwaldd2034312007-05-18 16:29:38 +00001776PyObject *
1777PyUnicode_FromFormatV(const char *format, va_list vargs)
1778{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001779 va_list count;
1780 Py_ssize_t callcount = 0;
1781 PyObject **callresults = NULL;
1782 PyObject **callresult = NULL;
1783 Py_ssize_t n = 0;
1784 int width = 0;
1785 int precision = 0;
1786 int zeropad;
1787 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001789 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001790 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001791 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1792 Py_UCS4 argmaxchar;
1793 Py_ssize_t numbersize = 0;
1794 char *numberresults = NULL;
1795 char *numberresult = NULL;
1796 Py_ssize_t i;
1797 int kind;
1798 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001799
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001800 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001801 /* step 1: count the number of %S/%R/%A/%s format specifications
1802 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1803 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001804 * result in an array)
1805 * also esimate a upper bound for all the number formats in the string,
1806 * numbers will be formated in step 3 and be keept in a '\0'-separated
1807 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001808 for (f = format; *f; f++) {
1809 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001810 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1812 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1813 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1814 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001817#ifdef HAVE_LONG_LONG
1818 if (longlongflag) {
1819 if (width < MAX_LONG_LONG_CHARS)
1820 width = MAX_LONG_LONG_CHARS;
1821 }
1822 else
1823#endif
1824 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1825 including sign. Decimal takes the most space. This
1826 isn't enough for octal. If a width is specified we
1827 need more (which we allocate later). */
1828 if (width < MAX_LONG_CHARS)
1829 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830
1831 /* account for the size + '\0' to separate numbers
1832 inside of the numberresults buffer */
1833 numbersize += (width + 1);
1834 }
1835 }
1836 else if ((unsigned char)*f > 127) {
1837 PyErr_Format(PyExc_ValueError,
1838 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1839 "string, got a non-ASCII byte: 0x%02x",
1840 (unsigned char)*f);
1841 return NULL;
1842 }
1843 }
1844 /* step 2: allocate memory for the results of
1845 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1846 if (callcount) {
1847 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1848 if (!callresults) {
1849 PyErr_NoMemory();
1850 return NULL;
1851 }
1852 callresult = callresults;
1853 }
1854 /* step 2.5: allocate memory for the results of formating numbers */
1855 if (numbersize) {
1856 numberresults = PyObject_Malloc(numbersize);
1857 if (!numberresults) {
1858 PyErr_NoMemory();
1859 goto fail;
1860 }
1861 numberresult = numberresults;
1862 }
1863
1864 /* step 3: format numbers and figure out how large a buffer we need */
1865 for (f = format; *f; f++) {
1866 if (*f == '%') {
1867 const char* p;
1868 int longflag;
1869 int longlongflag;
1870 int size_tflag;
1871 int numprinted;
1872
1873 p = f;
1874 zeropad = (f[1] == '0');
1875 f = parse_format_flags(f, &width, &precision,
1876 &longflag, &longlongflag, &size_tflag);
1877 switch (*f) {
1878 case 'c':
1879 {
1880 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001881 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001882 n++;
1883 break;
1884 }
1885 case '%':
1886 n++;
1887 break;
1888 case 'i':
1889 case 'd':
1890 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1891 width, precision, *f);
1892 if (longflag)
1893 numprinted = sprintf(numberresult, fmt,
1894 va_arg(count, long));
1895#ifdef HAVE_LONG_LONG
1896 else if (longlongflag)
1897 numprinted = sprintf(numberresult, fmt,
1898 va_arg(count, PY_LONG_LONG));
1899#endif
1900 else if (size_tflag)
1901 numprinted = sprintf(numberresult, fmt,
1902 va_arg(count, Py_ssize_t));
1903 else
1904 numprinted = sprintf(numberresult, fmt,
1905 va_arg(count, int));
1906 n += numprinted;
1907 /* advance by +1 to skip over the '\0' */
1908 numberresult += (numprinted + 1);
1909 assert(*(numberresult - 1) == '\0');
1910 assert(*(numberresult - 2) != '\0');
1911 assert(numprinted >= 0);
1912 assert(numberresult <= numberresults + numbersize);
1913 break;
1914 case 'u':
1915 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1916 width, precision, 'u');
1917 if (longflag)
1918 numprinted = sprintf(numberresult, fmt,
1919 va_arg(count, unsigned long));
1920#ifdef HAVE_LONG_LONG
1921 else if (longlongflag)
1922 numprinted = sprintf(numberresult, fmt,
1923 va_arg(count, unsigned PY_LONG_LONG));
1924#endif
1925 else if (size_tflag)
1926 numprinted = sprintf(numberresult, fmt,
1927 va_arg(count, size_t));
1928 else
1929 numprinted = sprintf(numberresult, fmt,
1930 va_arg(count, unsigned int));
1931 n += numprinted;
1932 numberresult += (numprinted + 1);
1933 assert(*(numberresult - 1) == '\0');
1934 assert(*(numberresult - 2) != '\0');
1935 assert(numprinted >= 0);
1936 assert(numberresult <= numberresults + numbersize);
1937 break;
1938 case 'x':
1939 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1940 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1941 n += numprinted;
1942 numberresult += (numprinted + 1);
1943 assert(*(numberresult - 1) == '\0');
1944 assert(*(numberresult - 2) != '\0');
1945 assert(numprinted >= 0);
1946 assert(numberresult <= numberresults + numbersize);
1947 break;
1948 case 'p':
1949 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1950 /* %p is ill-defined: ensure leading 0x. */
1951 if (numberresult[1] == 'X')
1952 numberresult[1] = 'x';
1953 else if (numberresult[1] != 'x') {
1954 memmove(numberresult + 2, numberresult,
1955 strlen(numberresult) + 1);
1956 numberresult[0] = '0';
1957 numberresult[1] = 'x';
1958 numprinted += 2;
1959 }
1960 n += numprinted;
1961 numberresult += (numprinted + 1);
1962 assert(*(numberresult - 1) == '\0');
1963 assert(*(numberresult - 2) != '\0');
1964 assert(numprinted >= 0);
1965 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001966 break;
1967 case 's':
1968 {
1969 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001970 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001971 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1972 if (!str)
1973 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001974 /* since PyUnicode_DecodeUTF8 returns already flexible
1975 unicode objects, there is no need to call ready on them */
1976 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001977 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001978 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001979 /* Remember the str and switch to the next slot */
1980 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001981 break;
1982 }
1983 case 'U':
1984 {
1985 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02001986 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001987 if (PyUnicode_READY(obj) == -1)
1988 goto fail;
1989 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001990 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001991 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001992 break;
1993 }
1994 case 'V':
1995 {
1996 PyObject *obj = va_arg(count, PyObject *);
1997 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001998 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001999 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002000 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002001 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 if (PyUnicode_READY(obj) == -1)
2003 goto fail;
2004 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002005 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002006 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002007 *callresult++ = NULL;
2008 }
2009 else {
2010 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2011 if (!str_obj)
2012 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002013 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002014 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002015 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002016 *callresult++ = str_obj;
2017 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002018 break;
2019 }
2020 case 'S':
2021 {
2022 PyObject *obj = va_arg(count, PyObject *);
2023 PyObject *str;
2024 assert(obj);
2025 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002027 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002029 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002031 /* Remember the str and switch to the next slot */
2032 *callresult++ = str;
2033 break;
2034 }
2035 case 'R':
2036 {
2037 PyObject *obj = va_arg(count, PyObject *);
2038 PyObject *repr;
2039 assert(obj);
2040 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002042 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002043 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002044 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002045 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002046 /* Remember the repr and switch to the next slot */
2047 *callresult++ = repr;
2048 break;
2049 }
2050 case 'A':
2051 {
2052 PyObject *obj = va_arg(count, PyObject *);
2053 PyObject *ascii;
2054 assert(obj);
2055 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002056 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002057 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002058 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002059 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002060 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002061 /* Remember the repr and switch to the next slot */
2062 *callresult++ = ascii;
2063 break;
2064 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002065 default:
2066 /* if we stumble upon an unknown
2067 formatting code, copy the rest of
2068 the format string to the output
2069 string. (we cannot just skip the
2070 code, since there's no way to know
2071 what's in the argument list) */
2072 n += strlen(p);
2073 goto expand;
2074 }
2075 } else
2076 n++;
2077 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002078 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002079 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002080 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002081 we don't have to resize the string.
2082 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002083 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002084 if (!string)
2085 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002086 kind = PyUnicode_KIND(string);
2087 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002088 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002089 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002092 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002093 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002094
2095 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002096 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2097 /* checking for == because the last argument could be a empty
2098 string, which causes i to point to end, the assert at the end of
2099 the loop */
2100 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002101
Benjamin Peterson14339b62009-01-31 16:36:08 +00002102 switch (*f) {
2103 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002104 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002105 const int ordinal = va_arg(vargs, int);
2106 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002107 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002108 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002109 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002110 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002111 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002112 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002113 case 'p':
2114 /* unused, since we already have the result */
2115 if (*f == 'p')
2116 (void) va_arg(vargs, void *);
2117 else
2118 (void) va_arg(vargs, int);
2119 /* extract the result from numberresults and append. */
2120 for (; *numberresult; ++i, ++numberresult)
2121 PyUnicode_WRITE(kind, data, i, *numberresult);
2122 /* skip over the separating '\0' */
2123 assert(*numberresult == '\0');
2124 numberresult++;
2125 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002126 break;
2127 case 's':
2128 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002129 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002130 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002131 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002132 size = PyUnicode_GET_LENGTH(*callresult);
2133 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002134 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2135 *callresult, 0,
2136 size) < 0)
2137 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002138 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002139 /* We're done with the unicode()/repr() => forget it */
2140 Py_DECREF(*callresult);
2141 /* switch to next unicode()/repr() result */
2142 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002143 break;
2144 }
2145 case 'U':
2146 {
2147 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002148 Py_ssize_t size;
2149 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2150 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002151 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2152 obj, 0,
2153 size) < 0)
2154 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002155 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002156 break;
2157 }
2158 case 'V':
2159 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002160 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002161 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002162 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002163 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002164 size = PyUnicode_GET_LENGTH(obj);
2165 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002166 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2167 obj, 0,
2168 size) < 0)
2169 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002171 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172 size = PyUnicode_GET_LENGTH(*callresult);
2173 assert(PyUnicode_KIND(*callresult) <=
2174 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002175 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2176 *callresult,
2177 0, size) < 0)
2178 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002179 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002180 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002181 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002182 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002183 break;
2184 }
2185 case 'S':
2186 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002187 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002188 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002189 /* unused, since we already have the result */
2190 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002191 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002192 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2193 *callresult, 0,
2194 PyUnicode_GET_LENGTH(*callresult)) < 0)
2195 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002197 /* We're done with the unicode()/repr() => forget it */
2198 Py_DECREF(*callresult);
2199 /* switch to next unicode()/repr() result */
2200 ++callresult;
2201 break;
2202 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002203 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002204 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002205 break;
2206 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002207 for (; *p; ++p, ++i)
2208 PyUnicode_WRITE(kind, data, i, *p);
2209 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002210 goto end;
2211 }
Victor Stinner1205f272010-09-11 00:54:47 +00002212 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002213 else {
2214 assert(i < PyUnicode_GET_LENGTH(string));
2215 PyUnicode_WRITE(kind, data, i++, *f);
2216 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002219
Benjamin Peterson29060642009-01-31 22:14:21 +00002220 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002221 if (callresults)
2222 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 if (numberresults)
2224 PyObject_Free(numberresults);
2225 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002226 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002227 if (callresults) {
2228 PyObject **callresult2 = callresults;
2229 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002230 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002231 ++callresult2;
2232 }
2233 PyObject_Free(callresults);
2234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 if (numberresults)
2236 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002237 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002238}
2239
Walter Dörwaldd2034312007-05-18 16:29:38 +00002240PyObject *
2241PyUnicode_FromFormat(const char *format, ...)
2242{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002243 PyObject* ret;
2244 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002245
2246#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002247 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002248#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002249 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002250#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002251 ret = PyUnicode_FromFormatV(format, vargs);
2252 va_end(vargs);
2253 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002254}
2255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002256#ifdef HAVE_WCHAR_H
2257
Victor Stinner5593d8a2010-10-02 11:11:27 +00002258/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2259 convert a Unicode object to a wide character string.
2260
Victor Stinnerd88d9832011-09-06 02:00:05 +02002261 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002262 character) required to convert the unicode object. Ignore size argument.
2263
Victor Stinnerd88d9832011-09-06 02:00:05 +02002264 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002265 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002266 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002267static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002268unicode_aswidechar(PyUnicodeObject *unicode,
2269 wchar_t *w,
2270 Py_ssize_t size)
2271{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002272 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 const wchar_t *wstr;
2274
2275 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2276 if (wstr == NULL)
2277 return -1;
2278
Victor Stinner5593d8a2010-10-02 11:11:27 +00002279 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002280 if (size > res)
2281 size = res + 1;
2282 else
2283 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002285 return res;
2286 }
2287 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002288 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002289}
2290
2291Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002292PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002293 wchar_t *w,
2294 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002295{
2296 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002297 PyErr_BadInternalCall();
2298 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002300 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301}
2302
Victor Stinner137c34c2010-09-29 10:25:54 +00002303wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002304PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002305 Py_ssize_t *size)
2306{
2307 wchar_t* buffer;
2308 Py_ssize_t buflen;
2309
2310 if (unicode == NULL) {
2311 PyErr_BadInternalCall();
2312 return NULL;
2313 }
2314
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002315 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002316 if (buflen == -1)
2317 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002318 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002319 PyErr_NoMemory();
2320 return NULL;
2321 }
2322
Victor Stinner137c34c2010-09-29 10:25:54 +00002323 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2324 if (buffer == NULL) {
2325 PyErr_NoMemory();
2326 return NULL;
2327 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002328 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002329 if (buflen == -1)
2330 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002331 if (size != NULL)
2332 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002333 return buffer;
2334}
2335
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002336#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002337
Alexander Belopolsky40018472011-02-26 01:02:56 +00002338PyObject *
2339PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002340{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002341 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002342 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002343 PyErr_SetString(PyExc_ValueError,
2344 "chr() arg not in range(0x110000)");
2345 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002346 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002347
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002348 if (ordinal < 256)
2349 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002351 v = PyUnicode_New(1, ordinal);
2352 if (v == NULL)
2353 return NULL;
2354 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2355 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002356}
2357
Alexander Belopolsky40018472011-02-26 01:02:56 +00002358PyObject *
2359PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002360{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002361 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002362 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002363 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002364 if (PyUnicode_READY(obj))
2365 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002366 Py_INCREF(obj);
2367 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002368 }
2369 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002370 /* For a Unicode subtype that's not a Unicode object,
2371 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002372 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002373 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002374 PyErr_Format(PyExc_TypeError,
2375 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002376 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002377 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002378}
2379
Alexander Belopolsky40018472011-02-26 01:02:56 +00002380PyObject *
2381PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002382 const char *encoding,
2383 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002384{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002385 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002386 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002387
Guido van Rossumd57fd912000-03-10 22:53:23 +00002388 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002389 PyErr_BadInternalCall();
2390 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002391 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002392
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002393 /* Decoding bytes objects is the most common case and should be fast */
2394 if (PyBytes_Check(obj)) {
2395 if (PyBytes_GET_SIZE(obj) == 0) {
2396 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002397 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002398 }
2399 else {
2400 v = PyUnicode_Decode(
2401 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2402 encoding, errors);
2403 }
2404 return v;
2405 }
2406
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002407 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002408 PyErr_SetString(PyExc_TypeError,
2409 "decoding str is not supported");
2410 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002411 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002412
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002413 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2414 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2415 PyErr_Format(PyExc_TypeError,
2416 "coercing to str: need bytes, bytearray "
2417 "or buffer-like object, %.80s found",
2418 Py_TYPE(obj)->tp_name);
2419 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002420 }
Tim Petersced69f82003-09-16 20:30:58 +00002421
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002422 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002423 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002424 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002425 }
Tim Petersced69f82003-09-16 20:30:58 +00002426 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002427 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002428
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002429 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002430 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002431}
2432
/* Normalize an encoding name so that spellings such as "UTF_8" match the
   shortcut tables: upper case letters are converted to lower case and
   '_' is replaced with '-'.

   encoding   null-terminated name to normalize
   lower      output buffer receiving the null-terminated result
   lower_len  capacity of lower in bytes, terminator included

   Returns 1 on success, 0 if the normalized name (plus terminator) does
   not fit into lower, which includes the case lower_len == 0.  On failure
   the contents of lower are unspecified and not null-terminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst;
    char *dst_end;

    /* A zero-sized buffer cannot even hold the terminator.  Without this
       check lower_len - 1 would wrap around and dst_end would point far
       outside the buffer. */
    if (lower_len == 0)
        return 0;

    dst = lower;
    dst_end = &lower[lower_len - 1];    /* slot reserved for '\0' */
    for (src = encoding; *src; src++) {
        if (dst == dst_end)
            return 0;
        if (Py_ISUPPER(*src))
            *dst++ = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst++ = '-';
        else
            *dst++ = *src;
    }
    *dst = '\0';
    return 1;
}
2465
Alexander Belopolsky40018472011-02-26 01:02:56 +00002466PyObject *
2467PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002468 Py_ssize_t size,
2469 const char *encoding,
2470 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002471{
2472 PyObject *buffer = NULL, *unicode;
2473 Py_buffer info;
2474 char lower[11]; /* Enough for any encoding shortcut */
2475
2476 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002477 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002478
2479 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002480 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002481 if ((strcmp(lower, "utf-8") == 0) ||
2482 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002483 return PyUnicode_DecodeUTF8(s, size, errors);
2484 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002485 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002486 (strcmp(lower, "iso-8859-1") == 0))
2487 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002488#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002489 else if (strcmp(lower, "mbcs") == 0)
2490 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002491#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002492 else if (strcmp(lower, "ascii") == 0)
2493 return PyUnicode_DecodeASCII(s, size, errors);
2494 else if (strcmp(lower, "utf-16") == 0)
2495 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2496 else if (strcmp(lower, "utf-32") == 0)
2497 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2498 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002499
2500 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002501 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002502 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002503 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002504 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002505 if (buffer == NULL)
2506 goto onError;
2507 unicode = PyCodec_Decode(buffer, encoding, errors);
2508 if (unicode == NULL)
2509 goto onError;
2510 if (!PyUnicode_Check(unicode)) {
2511 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002512 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002513 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002514 Py_DECREF(unicode);
2515 goto onError;
2516 }
2517 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002518 if (PyUnicode_READY(unicode)) {
2519 Py_DECREF(unicode);
2520 return NULL;
2521 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002522 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002523
Benjamin Peterson29060642009-01-31 22:14:21 +00002524 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525 Py_XDECREF(buffer);
2526 return NULL;
2527}
2528
Alexander Belopolsky40018472011-02-26 01:02:56 +00002529PyObject *
2530PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002531 const char *encoding,
2532 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002533{
2534 PyObject *v;
2535
2536 if (!PyUnicode_Check(unicode)) {
2537 PyErr_BadArgument();
2538 goto onError;
2539 }
2540
2541 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002542 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002543
2544 /* Decode via the codec registry */
2545 v = PyCodec_Decode(unicode, encoding, errors);
2546 if (v == NULL)
2547 goto onError;
2548 return v;
2549
Benjamin Peterson29060642009-01-31 22:14:21 +00002550 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002551 return NULL;
2552}
2553
Alexander Belopolsky40018472011-02-26 01:02:56 +00002554PyObject *
2555PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002556 const char *encoding,
2557 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002558{
2559 PyObject *v;
2560
2561 if (!PyUnicode_Check(unicode)) {
2562 PyErr_BadArgument();
2563 goto onError;
2564 }
2565
2566 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002567 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002568
2569 /* Decode via the codec registry */
2570 v = PyCodec_Decode(unicode, encoding, errors);
2571 if (v == NULL)
2572 goto onError;
2573 if (!PyUnicode_Check(v)) {
2574 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002575 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002576 Py_TYPE(v)->tp_name);
2577 Py_DECREF(v);
2578 goto onError;
2579 }
2580 return v;
2581
Benjamin Peterson29060642009-01-31 22:14:21 +00002582 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002583 return NULL;
2584}
2585
Alexander Belopolsky40018472011-02-26 01:02:56 +00002586PyObject *
2587PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002588 Py_ssize_t size,
2589 const char *encoding,
2590 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002591{
2592 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002593
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594 unicode = PyUnicode_FromUnicode(s, size);
2595 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002596 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002597 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2598 Py_DECREF(unicode);
2599 return v;
2600}
2601
Alexander Belopolsky40018472011-02-26 01:02:56 +00002602PyObject *
2603PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002604 const char *encoding,
2605 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002606{
2607 PyObject *v;
2608
2609 if (!PyUnicode_Check(unicode)) {
2610 PyErr_BadArgument();
2611 goto onError;
2612 }
2613
2614 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002615 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002616
2617 /* Encode via the codec registry */
2618 v = PyCodec_Encode(unicode, encoding, errors);
2619 if (v == NULL)
2620 goto onError;
2621 return v;
2622
Benjamin Peterson29060642009-01-31 22:14:21 +00002623 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002624 return NULL;
2625}
2626
Victor Stinnerad158722010-10-27 00:25:46 +00002627PyObject *
2628PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002629{
Victor Stinner99b95382011-07-04 14:23:54 +02002630#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002631 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2632 PyUnicode_GET_SIZE(unicode),
2633 NULL);
2634#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002636#else
Victor Stinner793b5312011-04-27 00:24:21 +02002637 PyInterpreterState *interp = PyThreadState_GET()->interp;
2638 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2639 cannot use it to encode and decode filenames before it is loaded. Load
2640 the Python codec requires to encode at least its own filename. Use the C
2641 version of the locale codec until the codec registry is initialized and
2642 the Python codec is loaded.
2643
2644 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2645 cannot only rely on it: check also interp->fscodec_initialized for
2646 subinterpreters. */
2647 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002648 return PyUnicode_AsEncodedString(unicode,
2649 Py_FileSystemDefaultEncoding,
2650 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002651 }
2652 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002653 /* locale encoding with surrogateescape */
2654 wchar_t *wchar;
2655 char *bytes;
2656 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002657 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002658
2659 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2660 if (wchar == NULL)
2661 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002662 bytes = _Py_wchar2char(wchar, &error_pos);
2663 if (bytes == NULL) {
2664 if (error_pos != (size_t)-1) {
2665 char *errmsg = strerror(errno);
2666 PyObject *exc = NULL;
2667 if (errmsg == NULL)
2668 errmsg = "Py_wchar2char() failed";
2669 raise_encode_exception(&exc,
2670 "filesystemencoding",
2671 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2672 error_pos, error_pos+1,
2673 errmsg);
2674 Py_XDECREF(exc);
2675 }
2676 else
2677 PyErr_NoMemory();
2678 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002679 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002680 }
2681 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002682
2683 bytes_obj = PyBytes_FromString(bytes);
2684 PyMem_Free(bytes);
2685 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002686 }
Victor Stinnerad158722010-10-27 00:25:46 +00002687#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002688}
2689
Alexander Belopolsky40018472011-02-26 01:02:56 +00002690PyObject *
2691PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002692 const char *encoding,
2693 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694{
2695 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002696 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002697
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698 if (!PyUnicode_Check(unicode)) {
2699 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002700 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701 }
Fred Drakee4315f52000-05-09 19:53:39 +00002702
Victor Stinner2f283c22011-03-02 01:21:46 +00002703 if (encoding == NULL) {
2704 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002705 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002706 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002707 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002708 }
Fred Drakee4315f52000-05-09 19:53:39 +00002709
2710 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002711 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002712 if ((strcmp(lower, "utf-8") == 0) ||
2713 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002714 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002715 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002716 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002717 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002718 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002719 }
Victor Stinner37296e82010-06-10 13:36:23 +00002720 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002721 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002722 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002723 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002724#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002725 else if (strcmp(lower, "mbcs") == 0)
2726 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2727 PyUnicode_GET_SIZE(unicode),
2728 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002729#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002730 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002731 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002732 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733
2734 /* Encode via the codec registry */
2735 v = PyCodec_Encode(unicode, encoding, errors);
2736 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002737 return NULL;
2738
2739 /* The normal path */
2740 if (PyBytes_Check(v))
2741 return v;
2742
2743 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002744 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002745 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002746 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002747
2748 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2749 "encoder %s returned bytearray instead of bytes",
2750 encoding);
2751 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002752 Py_DECREF(v);
2753 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002754 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002755
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002756 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2757 Py_DECREF(v);
2758 return b;
2759 }
2760
2761 PyErr_Format(PyExc_TypeError,
2762 "encoder did not return a bytes object (type=%.400s)",
2763 Py_TYPE(v)->tp_name);
2764 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002765 return NULL;
2766}
2767
Alexander Belopolsky40018472011-02-26 01:02:56 +00002768PyObject *
2769PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002770 const char *encoding,
2771 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002772{
2773 PyObject *v;
2774
2775 if (!PyUnicode_Check(unicode)) {
2776 PyErr_BadArgument();
2777 goto onError;
2778 }
2779
2780 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002781 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002782
2783 /* Encode via the codec registry */
2784 v = PyCodec_Encode(unicode, encoding, errors);
2785 if (v == NULL)
2786 goto onError;
2787 if (!PyUnicode_Check(v)) {
2788 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002789 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002790 Py_TYPE(v)->tp_name);
2791 Py_DECREF(v);
2792 goto onError;
2793 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002795
Benjamin Peterson29060642009-01-31 22:14:21 +00002796 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797 return NULL;
2798}
2799
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002800PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002801PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002802 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002803 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2804}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002805
Christian Heimes5894ba72007-11-04 11:43:14 +00002806PyObject*
2807PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2808{
Victor Stinner99b95382011-07-04 14:23:54 +02002809#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002810 return PyUnicode_DecodeMBCS(s, size, NULL);
2811#elif defined(__APPLE__)
2812 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2813#else
Victor Stinner793b5312011-04-27 00:24:21 +02002814 PyInterpreterState *interp = PyThreadState_GET()->interp;
2815 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2816 cannot use it to encode and decode filenames before it is loaded. Load
2817 the Python codec requires to encode at least its own filename. Use the C
2818 version of the locale codec until the codec registry is initialized and
2819 the Python codec is loaded.
2820
2821 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2822 cannot only rely on it: check also interp->fscodec_initialized for
2823 subinterpreters. */
2824 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002825 return PyUnicode_Decode(s, size,
2826 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002827 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002828 }
2829 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002830 /* locale encoding with surrogateescape */
2831 wchar_t *wchar;
2832 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002833 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002834
2835 if (s[size] != '\0' || size != strlen(s)) {
2836 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2837 return NULL;
2838 }
2839
Victor Stinner168e1172010-10-16 23:16:16 +00002840 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002841 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002842 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002843
Victor Stinner168e1172010-10-16 23:16:16 +00002844 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002845 PyMem_Free(wchar);
2846 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002847 }
Victor Stinnerad158722010-10-27 00:25:46 +00002848#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002849}
2850
Martin v. Löwis011e8422009-05-05 04:43:17 +00002851
2852int
2853PyUnicode_FSConverter(PyObject* arg, void* addr)
2854{
2855 PyObject *output = NULL;
2856 Py_ssize_t size;
2857 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002858 if (arg == NULL) {
2859 Py_DECREF(*(PyObject**)addr);
2860 return 1;
2861 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002862 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002863 output = arg;
2864 Py_INCREF(output);
2865 }
2866 else {
2867 arg = PyUnicode_FromObject(arg);
2868 if (!arg)
2869 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002870 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002871 Py_DECREF(arg);
2872 if (!output)
2873 return 0;
2874 if (!PyBytes_Check(output)) {
2875 Py_DECREF(output);
2876 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2877 return 0;
2878 }
2879 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002880 size = PyBytes_GET_SIZE(output);
2881 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002882 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002883 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002884 Py_DECREF(output);
2885 return 0;
2886 }
2887 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002888 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002889}
2890
2891
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002892int
2893PyUnicode_FSDecoder(PyObject* arg, void* addr)
2894{
2895 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002896 if (arg == NULL) {
2897 Py_DECREF(*(PyObject**)addr);
2898 return 1;
2899 }
2900 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002901 if (PyUnicode_READY(arg))
2902 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002903 output = arg;
2904 Py_INCREF(output);
2905 }
2906 else {
2907 arg = PyBytes_FromObject(arg);
2908 if (!arg)
2909 return 0;
2910 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2911 PyBytes_GET_SIZE(arg));
2912 Py_DECREF(arg);
2913 if (!output)
2914 return 0;
2915 if (!PyUnicode_Check(output)) {
2916 Py_DECREF(output);
2917 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2918 return 0;
2919 }
2920 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002921 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2922 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002923 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2924 Py_DECREF(output);
2925 return 0;
2926 }
2927 *(PyObject**)addr = output;
2928 return Py_CLEANUP_SUPPORTED;
2929}
2930
2931
Martin v. Löwis5b222132007-06-10 09:51:05 +00002932char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002933PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002934{
Christian Heimesf3863112007-11-22 07:46:41 +00002935 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002936 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2937
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002938 if (!PyUnicode_Check(unicode)) {
2939 PyErr_BadArgument();
2940 return NULL;
2941 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002942 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002943 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002944
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002945 if (PyUnicode_UTF8(unicode) == NULL) {
2946 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002947 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2948 if (bytes == NULL)
2949 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002950 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2951 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002952 Py_DECREF(bytes);
2953 return NULL;
2954 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002955 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
2956 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002957 Py_DECREF(bytes);
2958 }
2959
2960 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002961 *psize = PyUnicode_UTF8_LENGTH(unicode);
2962 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002963}
2964
2965char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002966PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002967{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002968 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2969}
2970
#if defined(Py_DEBUG)
/* Debug builds only: bumped by PyUnicode_AsUnicodeAndSize() each time it
   has to build the wchar_t representation of a string. */
int unicode_as_unicode_calls = 0;
#endif
2974
2975
2976Py_UNICODE *
2977PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2978{
2979 PyUnicodeObject *u;
2980 const unsigned char *one_byte;
2981#if SIZEOF_WCHAR_T == 4
2982 const Py_UCS2 *two_bytes;
2983#else
2984 const Py_UCS4 *four_bytes;
2985 const Py_UCS4 *ucs4_end;
2986 Py_ssize_t num_surrogates;
2987#endif
2988 wchar_t *w;
2989 wchar_t *wchar_end;
2990
2991 if (!PyUnicode_Check(unicode)) {
2992 PyErr_BadArgument();
2993 return NULL;
2994 }
2995 u = (PyUnicodeObject*)unicode;
2996 if (_PyUnicode_WSTR(u) == NULL) {
2997 /* Non-ASCII compact unicode object */
2998 assert(_PyUnicode_KIND(u) != 0);
2999 assert(PyUnicode_IS_READY(u));
3000
3001#ifdef Py_DEBUG
3002 ++unicode_as_unicode_calls;
3003#endif
3004
3005 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3006#if SIZEOF_WCHAR_T == 2
3007 four_bytes = PyUnicode_4BYTE_DATA(u);
3008 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3009 num_surrogates = 0;
3010
3011 for (; four_bytes < ucs4_end; ++four_bytes) {
3012 if (*four_bytes > 0xFFFF)
3013 ++num_surrogates;
3014 }
3015
3016 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3017 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3018 if (!_PyUnicode_WSTR(u)) {
3019 PyErr_NoMemory();
3020 return NULL;
3021 }
3022 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3023
3024 w = _PyUnicode_WSTR(u);
3025 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3026 four_bytes = PyUnicode_4BYTE_DATA(u);
3027 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3028 if (*four_bytes > 0xFFFF) {
3029 /* encode surrogate pair in this case */
3030 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3031 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3032 }
3033 else
3034 *w = *four_bytes;
3035
3036 if (w > wchar_end) {
3037 assert(0 && "Miscalculated string end");
3038 }
3039 }
3040 *w = 0;
3041#else
3042 /* sizeof(wchar_t) == 4 */
3043 Py_FatalError("Impossible unicode object state, wstr and str "
3044 "should share memory already.");
3045 return NULL;
3046#endif
3047 }
3048 else {
3049 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3050 (_PyUnicode_LENGTH(u) + 1));
3051 if (!_PyUnicode_WSTR(u)) {
3052 PyErr_NoMemory();
3053 return NULL;
3054 }
3055 if (!PyUnicode_IS_COMPACT_ASCII(u))
3056 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3057 w = _PyUnicode_WSTR(u);
3058 wchar_end = w + _PyUnicode_LENGTH(u);
3059
3060 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3061 one_byte = PyUnicode_1BYTE_DATA(u);
3062 for (; w < wchar_end; ++one_byte, ++w)
3063 *w = *one_byte;
3064 /* null-terminate the wstr */
3065 *w = 0;
3066 }
3067 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3068#if SIZEOF_WCHAR_T == 4
3069 two_bytes = PyUnicode_2BYTE_DATA(u);
3070 for (; w < wchar_end; ++two_bytes, ++w)
3071 *w = *two_bytes;
3072 /* null-terminate the wstr */
3073 *w = 0;
3074#else
3075 /* sizeof(wchar_t) == 2 */
3076 PyObject_FREE(_PyUnicode_WSTR(u));
3077 _PyUnicode_WSTR(u) = NULL;
3078 Py_FatalError("Impossible unicode object state, wstr "
3079 "and str should share memory already.");
3080 return NULL;
3081#endif
3082 }
3083 else {
3084 assert(0 && "This should never happen.");
3085 }
3086 }
3087 }
3088 if (size != NULL)
3089 *size = PyUnicode_WSTR_LENGTH(u);
3090 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003091}
3092
Alexander Belopolsky40018472011-02-26 01:02:56 +00003093Py_UNICODE *
3094PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003096 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003097}
3098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003099
Alexander Belopolsky40018472011-02-26 01:02:56 +00003100Py_ssize_t
3101PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102{
3103 if (!PyUnicode_Check(unicode)) {
3104 PyErr_BadArgument();
3105 goto onError;
3106 }
3107 return PyUnicode_GET_SIZE(unicode);
3108
Benjamin Peterson29060642009-01-31 22:14:21 +00003109 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110 return -1;
3111}
3112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003113Py_ssize_t
3114PyUnicode_GetLength(PyObject *unicode)
3115{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003116 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003117 PyErr_BadArgument();
3118 return -1;
3119 }
3120
3121 return PyUnicode_GET_LENGTH(unicode);
3122}
3123
3124Py_UCS4
3125PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3126{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003127 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3128 PyErr_BadArgument();
3129 return (Py_UCS4)-1;
3130 }
3131 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3132 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003133 return (Py_UCS4)-1;
3134 }
3135 return PyUnicode_READ_CHAR(unicode, index);
3136}
3137
3138int
3139PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3140{
3141 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003142 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003143 return -1;
3144 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003145 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3146 PyErr_SetString(PyExc_IndexError, "string index out of range");
3147 return -1;
3148 }
3149 if (_PyUnicode_Dirty(unicode))
3150 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003151 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3152 index, ch);
3153 return 0;
3154}
3155
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3161
Victor Stinner554f3f02010-06-16 23:33:54 +00003162/* create or adjust a UnicodeDecodeError */
3163static void
3164make_decode_exception(PyObject **exceptionObject,
3165 const char *encoding,
3166 const char *input, Py_ssize_t length,
3167 Py_ssize_t startpos, Py_ssize_t endpos,
3168 const char *reason)
3169{
3170 if (*exceptionObject == NULL) {
3171 *exceptionObject = PyUnicodeDecodeError_Create(
3172 encoding, input, length, startpos, endpos, reason);
3173 }
3174 else {
3175 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3176 goto onError;
3177 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3178 goto onError;
3179 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3180 goto onError;
3181 }
3182 return;
3183
3184onError:
3185 Py_DECREF(*exceptionObject);
3186 *exceptionObject = NULL;
3187}
3188
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003189/* error handling callback helper:
3190 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003191 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003192 and adjust various state variables.
3193 return 0 on success, -1 on error
3194*/
3195
Alexander Belopolsky40018472011-02-26 01:02:56 +00003196static int
3197unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003198 const char *encoding, const char *reason,
3199 const char **input, const char **inend, Py_ssize_t *startinpos,
3200 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3201 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003202{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003203 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003204
3205 PyObject *restuple = NULL;
3206 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003207 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003208 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003209 Py_ssize_t requiredsize;
3210 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003211 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003212 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003213 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003214 int res = -1;
3215
3216 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003217 *errorHandler = PyCodec_LookupError(errors);
3218 if (*errorHandler == NULL)
3219 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003220 }
3221
Victor Stinner554f3f02010-06-16 23:33:54 +00003222 make_decode_exception(exceptionObject,
3223 encoding,
3224 *input, *inend - *input,
3225 *startinpos, *endinpos,
3226 reason);
3227 if (*exceptionObject == NULL)
3228 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003229
3230 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3231 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003232 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003233 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003234 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003235 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236 }
3237 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003238 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003239
3240 /* Copy back the bytes variables, which might have been modified by the
3241 callback */
3242 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3243 if (!inputobj)
3244 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003245 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003246 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003247 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003248 *input = PyBytes_AS_STRING(inputobj);
3249 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003250 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003251 /* we can DECREF safely, as the exception has another reference,
3252 so the object won't go away. */
3253 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003254
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003255 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003256 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003257 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003258 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3259 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003260 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003261
3262 /* need more space? (at least enough for what we
3263 have+the replacement+the rest of the string (starting
3264 at the new input position), so we won't have to check space
3265 when there are no errors in the rest of the string) */
3266 repptr = PyUnicode_AS_UNICODE(repunicode);
3267 repsize = PyUnicode_GET_SIZE(repunicode);
3268 requiredsize = *outpos + repsize + insize-newpos;
3269 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003270 if (requiredsize<2*outsize)
3271 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003272 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003273 goto onError;
3274 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003275 }
3276 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003277 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003278 Py_UNICODE_COPY(*outptr, repptr, repsize);
3279 *outptr += repsize;
3280 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003281
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003282 /* we made it! */
3283 res = 0;
3284
Benjamin Peterson29060642009-01-31 22:14:21 +00003285 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003286 Py_XDECREF(restuple);
3287 return res;
3288}
3289
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003290/* --- UTF-7 Codec -------------------------------------------------------- */
3291
Antoine Pitrou244651a2009-05-04 18:56:13 +00003292/* See RFC2152 for details. We encode conservatively and decode liberally. */
3293
3294/* Three simple macros defining base-64. */
3295
/* True if c belongs to the base-64 alphabet: A-Z, a-z, 0-9, '+' or '/'.
   Note that c is evaluated more than once. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z'))
3303
/* Map a base-64 character c to its 6-bit value (0..63).
   The caller must have checked IS_BASE64(c); any other input falls
   through to 63.  Note that c is evaluated more than once. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)
3311
/* Return the base-64 character that encodes the low 6 bits of n;
   any higher bits of n are ignored. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])
3316
/* DECODE_DIRECT: true if a byte c found in UTF-7 input (outside a
 * base64 section) stands for itself.  Decoding is permissive: every
 * ASCII byte qualifies except '+', which opens a base64 section.
 * Note that c is evaluated twice. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) < 128)
3324
3325/* The UTF-7 encoder treats ASCII characters differently according to
3326 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
3327 * the above). See RFC2152. This array identifies these different
3328 * sets:
3329 * 0 : "Set D"
3330 * alphanumeric and '(),-./:?
3331 * 1 : "Set O"
3332 * !"#$%&*;<=>@[]^_`{|}
3333 * 2 : "whitespace"
3334 * ht nl cr sp
3335 * 3 : special (must be base64 encoded)
3336 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
3337 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003338
/* Read-only lookup table, indexed by ASCII code (0..127); each entry is
   the UTF-7 category (0..3) of that character. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3358
/* ENCODE_DIRECT: true if character c is to be written as itself by the
 * UTF-7 encoder.  Only non-NUL ASCII characters can qualify; among
 * those, category 0 of utf7_category is always direct, category 1 only
 * when directO is true and category 2 only when directWS is true.
 * RFC2152 leaves the treatment of these two sets to the application,
 * hence the two flags.  Note that c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     (utf7_category[(c)] == 0 ||                        \
      ((directO) && utf7_category[(c)] == 1) ||         \
      ((directWS) && utf7_category[(c)] == 2)))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003370
Alexander Belopolsky40018472011-02-26 01:02:56 +00003371PyObject *
3372PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003373 Py_ssize_t size,
3374 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003375{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003376 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3377}
3378
Antoine Pitrou244651a2009-05-04 18:56:13 +00003379/* The decoder. The only state we preserve is our read position,
3380 * i.e. how many characters we have consumed. So if we end in the
3381 * middle of a shift sequence we have to back off the read position
3382 * and the output to the beginning of the sequence, otherwise we lose
3383 * all the shift state (seen bits, number of bits seen, high
3384 * surrogate). */
3385
Alexander Belopolsky40018472011-02-26 01:02:56 +00003386PyObject *
3387PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003388 Py_ssize_t size,
3389 const char *errors,
3390 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003391{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003392 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003393 Py_ssize_t startinpos;
3394 Py_ssize_t endinpos;
3395 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003396 const char *e;
3397 PyUnicodeObject *unicode;
3398 Py_UNICODE *p;
3399 const char *errmsg = "";
3400 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003401 Py_UNICODE *shiftOutStart;
3402 unsigned int base64bits = 0;
3403 unsigned long base64buffer = 0;
3404 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003405 PyObject *errorHandler = NULL;
3406 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003407
3408 unicode = _PyUnicode_New(size);
3409 if (!unicode)
3410 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003411 if (size == 0) {
3412 if (consumed)
3413 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003414 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003415 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003417 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003418 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003419 e = s + size;
3420
3421 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003422 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003423 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003424 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003425
Antoine Pitrou244651a2009-05-04 18:56:13 +00003426 if (inShift) { /* in a base-64 section */
3427 if (IS_BASE64(ch)) { /* consume a base-64 character */
3428 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3429 base64bits += 6;
3430 s++;
3431 if (base64bits >= 16) {
3432 /* we have enough bits for a UTF-16 value */
3433 Py_UNICODE outCh = (Py_UNICODE)
3434 (base64buffer >> (base64bits-16));
3435 base64bits -= 16;
3436 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3437 if (surrogate) {
3438 /* expecting a second surrogate */
3439 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3440#ifdef Py_UNICODE_WIDE
3441 *p++ = (((surrogate & 0x3FF)<<10)
3442 | (outCh & 0x3FF)) + 0x10000;
3443#else
3444 *p++ = surrogate;
3445 *p++ = outCh;
3446#endif
3447 surrogate = 0;
3448 }
3449 else {
3450 surrogate = 0;
3451 errmsg = "second surrogate missing";
3452 goto utf7Error;
3453 }
3454 }
3455 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3456 /* first surrogate */
3457 surrogate = outCh;
3458 }
3459 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3460 errmsg = "unexpected second surrogate";
3461 goto utf7Error;
3462 }
3463 else {
3464 *p++ = outCh;
3465 }
3466 }
3467 }
3468 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003469 inShift = 0;
3470 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003471 if (surrogate) {
3472 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003473 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003474 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003475 if (base64bits > 0) { /* left-over bits */
3476 if (base64bits >= 6) {
3477 /* We've seen at least one base-64 character */
3478 errmsg = "partial character in shift sequence";
3479 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003480 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003481 else {
3482 /* Some bits remain; they should be zero */
3483 if (base64buffer != 0) {
3484 errmsg = "non-zero padding bits in shift sequence";
3485 goto utf7Error;
3486 }
3487 }
3488 }
3489 if (ch != '-') {
3490 /* '-' is absorbed; other terminating
3491 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003492 *p++ = ch;
3493 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003494 }
3495 }
3496 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003498 s++; /* consume '+' */
3499 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003500 s++;
3501 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003502 }
3503 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003504 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003505 shiftOutStart = p;
3506 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003507 }
3508 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003509 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003510 *p++ = ch;
3511 s++;
3512 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003513 else {
3514 startinpos = s-starts;
3515 s++;
3516 errmsg = "unexpected special character";
3517 goto utf7Error;
3518 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003519 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003520utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521 outpos = p-PyUnicode_AS_UNICODE(unicode);
3522 endinpos = s-starts;
3523 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003524 errors, &errorHandler,
3525 "utf7", errmsg,
3526 &starts, &e, &startinpos, &endinpos, &exc, &s,
3527 &unicode, &outpos, &p))
3528 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003529 }
3530
Antoine Pitrou244651a2009-05-04 18:56:13 +00003531 /* end of string */
3532
3533 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3534 /* if we're in an inconsistent state, that's an error */
3535 if (surrogate ||
3536 (base64bits >= 6) ||
3537 (base64bits > 0 && base64buffer != 0)) {
3538 outpos = p-PyUnicode_AS_UNICODE(unicode);
3539 endinpos = size;
3540 if (unicode_decode_call_errorhandler(
3541 errors, &errorHandler,
3542 "utf7", "unterminated shift sequence",
3543 &starts, &e, &startinpos, &endinpos, &exc, &s,
3544 &unicode, &outpos, &p))
3545 goto onError;
3546 if (s < e)
3547 goto restart;
3548 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003549 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003550
3551 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003552 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003553 if (inShift) {
3554 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003555 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003556 }
3557 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003558 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003559 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003560 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003561
Victor Stinnerfe226c02011-10-03 03:52:20 +02003562 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003563 goto onError;
3564
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565 Py_XDECREF(errorHandler);
3566 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003567 if (PyUnicode_READY(unicode) == -1) {
3568 Py_DECREF(unicode);
3569 return NULL;
3570 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003571 return (PyObject *)unicode;
3572
Benjamin Peterson29060642009-01-31 22:14:21 +00003573 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003574 Py_XDECREF(errorHandler);
3575 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003576 Py_DECREF(unicode);
3577 return NULL;
3578}
3579
3580
Alexander Belopolsky40018472011-02-26 01:02:56 +00003581PyObject *
3582PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003583 Py_ssize_t size,
3584 int base64SetO,
3585 int base64WhiteSpace,
3586 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003587{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003588 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003589 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003590 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003591 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003592 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003593 unsigned int base64bits = 0;
3594 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003595 char * out;
3596 char * start;
3597
3598 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003599 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003600
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003601 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003602 return PyErr_NoMemory();
3603
Antoine Pitrou244651a2009-05-04 18:56:13 +00003604 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003605 if (v == NULL)
3606 return NULL;
3607
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003608 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003609 for (;i < size; ++i) {
3610 Py_UNICODE ch = s[i];
3611
Antoine Pitrou244651a2009-05-04 18:56:13 +00003612 if (inShift) {
3613 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3614 /* shifting out */
3615 if (base64bits) { /* output remaining bits */
3616 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3617 base64buffer = 0;
3618 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003619 }
3620 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003621 /* Characters not in the BASE64 set implicitly unshift the sequence
3622 so no '-' is required, except if the character is itself a '-' */
3623 if (IS_BASE64(ch) || ch == '-') {
3624 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003625 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003626 *out++ = (char) ch;
3627 }
3628 else {
3629 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003630 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003631 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003632 else { /* not in a shift sequence */
3633 if (ch == '+') {
3634 *out++ = '+';
3635 *out++ = '-';
3636 }
3637 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3638 *out++ = (char) ch;
3639 }
3640 else {
3641 *out++ = '+';
3642 inShift = 1;
3643 goto encode_char;
3644 }
3645 }
3646 continue;
3647encode_char:
3648#ifdef Py_UNICODE_WIDE
3649 if (ch >= 0x10000) {
3650 /* code first surrogate */
3651 base64bits += 16;
3652 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3653 while (base64bits >= 6) {
3654 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3655 base64bits -= 6;
3656 }
3657 /* prepare second surrogate */
3658 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3659 }
3660#endif
3661 base64bits += 16;
3662 base64buffer = (base64buffer << 16) | ch;
3663 while (base64bits >= 6) {
3664 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3665 base64bits -= 6;
3666 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003667 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003668 if (base64bits)
3669 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3670 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003671 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003672 if (_PyBytes_Resize(&v, out - start) < 0)
3673 return NULL;
3674 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003675}
3676
Antoine Pitrou244651a2009-05-04 18:56:13 +00003677#undef IS_BASE64
3678#undef FROM_BASE64
3679#undef TO_BASE64
3680#undef DECODE_DIRECT
3681#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003682
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683/* --- UTF-8 Codec -------------------------------------------------------- */
3684
/* Read-only lookup table, indexed by the first byte of a UTF-8 sequence. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3706
Alexander Belopolsky40018472011-02-26 01:02:56 +00003707PyObject *
3708PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003709 Py_ssize_t size,
3710 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711{
Walter Dörwald69652032004-09-07 20:24:22 +00003712 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3713}
3714
Antoine Pitrouab868312009-01-10 15:40:25 +00003715/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3716#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3717
3718/* Mask to quickly check whether a C 'long' contains a
3719 non-ASCII, UTF8-encoded char. */
3720#if (SIZEOF_LONG == 8)
3721# define ASCII_CHAR_MASK 0x8080808080808080L
3722#elif (SIZEOF_LONG == 4)
3723# define ASCII_CHAR_MASK 0x80808080L
3724#else
3725# error C 'long' size should be either 4 or 8!
3726#endif
3727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003728/* Scans a UTF-8 string and returns the maximum character to be expected,
3729 the size of the decoded unicode string and if any major errors were
3730 encountered.
3731
3732 This function does check basic UTF-8 sanity, it does however NOT CHECK
3733 if the string contains surrogates, and if all continuation bytes are
3734 within the correct ranges, these checks are performed in
3735 PyUnicode_DecodeUTF8Stateful.
3736
3737 If it sets has_errors to 1, it means the value of unicode_size and max_char
3738 will be bogus and you should not rely on useful information in them.
3739 */
3740static Py_UCS4
3741utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3742 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3743 int *has_errors)
3744{
3745 Py_ssize_t n;
3746 Py_ssize_t char_count = 0;
3747 Py_UCS4 max_char = 127, new_max;
3748 Py_UCS4 upper_bound;
3749 const unsigned char *p = (const unsigned char *)s;
3750 const unsigned char *end = p + string_size;
3751 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3752 int err = 0;
3753
3754 for (; p < end && !err; ++p, ++char_count) {
3755 /* Only check value if it's not a ASCII char... */
3756 if (*p < 0x80) {
3757 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3758 an explanation. */
3759 if (!((size_t) p & LONG_PTR_MASK)) {
3760 /* Help register allocation */
3761 register const unsigned char *_p = p;
3762 while (_p < aligned_end) {
3763 unsigned long value = *(unsigned long *) _p;
3764 if (value & ASCII_CHAR_MASK)
3765 break;
3766 _p += SIZEOF_LONG;
3767 char_count += SIZEOF_LONG;
3768 }
3769 p = _p;
3770 if (p == end)
3771 break;
3772 }
3773 }
3774 if (*p >= 0x80) {
3775 n = utf8_code_length[*p];
3776 new_max = max_char;
3777 switch (n) {
3778 /* invalid start byte */
3779 case 0:
3780 err = 1;
3781 break;
3782 case 2:
3783 /* Code points between 0x00FF and 0x07FF inclusive.
3784 Approximate the upper bound of the code point,
3785 if this flips over 255 we can be sure it will be more
3786 than 255 and the string will need 2 bytes per code coint,
3787 if it stays under or equal to 255, we can be sure 1 byte
3788 is enough.
3789 ((*p & 0b00011111) << 6) | 0b00111111 */
3790 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3791 if (max_char < upper_bound)
3792 new_max = upper_bound;
3793 /* Ensure we track at least that we left ASCII space. */
3794 if (new_max < 128)
3795 new_max = 128;
3796 break;
3797 case 3:
3798 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3799 always > 255 and <= 65535 and will always need 2 bytes. */
3800 if (max_char < 65535)
3801 new_max = 65535;
3802 break;
3803 case 4:
3804 /* Code point will be above 0xFFFF for sure in this case. */
3805 new_max = 65537;
3806 break;
3807 /* Internal error, this should be caught by the first if */
3808 case 1:
3809 default:
3810 assert(0 && "Impossible case in utf8_max_char_and_size");
3811 err = 1;
3812 }
3813 /* Instead of number of overall bytes for this code point,
3814 n containts the number of following bytes: */
3815 --n;
3816 /* Check if the follow up chars are all valid continuation bytes */
3817 if (n >= 1) {
3818 const unsigned char *cont;
3819 if ((p + n) >= end) {
3820 if (consumed == 0)
3821 /* incomplete data, non-incremental decoding */
3822 err = 1;
3823 break;
3824 }
3825 for (cont = p + 1; cont < (p + n); ++cont) {
3826 if ((*cont & 0xc0) != 0x80) {
3827 err = 1;
3828 break;
3829 }
3830 }
3831 p += n;
3832 }
3833 else
3834 err = 1;
3835 max_char = new_max;
3836 }
3837 }
3838
3839 if (unicode_size)
3840 *unicode_size = char_count;
3841 if (has_errors)
3842 *has_errors = err;
3843 return max_char;
3844}
3845
/* Store `value` at position `index` of the character buffer `buf`, choosing
   the element type from `kind`.  Behaves like PyUnicode_WRITE but also
   accepts PyUnicode_WCHAR_KIND, in which case `buf` is treated as a
   Py_UNICODE array (the wstr field of the legacy unicode representation).
   Any kind other than WCHAR, 1BYTE or 2BYTE is written as Py_UCS4.
   `kind` is evaluated once; `index` and `value` are evaluated once as well,
   in whichever branch is selected. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                   \
        const int write_kind_ = (kind);                                    \
        if (write_kind_ == PyUnicode_WCHAR_KIND) {                         \
            *((Py_UNICODE *)(buf) + (index)) = (Py_UNICODE)(value);        \
        }                                                                  \
        else if (write_kind_ == PyUnicode_1BYTE_KIND) {                    \
            *((unsigned char *)(buf) + (index)) = (unsigned char)(value);  \
        }                                                                  \
        else if (write_kind_ == PyUnicode_2BYTE_KIND) {                    \
            *((Py_UCS2 *)(buf) + (index)) = (Py_UCS2)(value);              \
        }                                                                  \
        else {                                                             \
            *((Py_UCS4 *)(buf) + (index)) = (Py_UCS4)(value);              \
        }                                                                  \
    } while (0)
3860
Alexander Belopolsky40018472011-02-26 01:02:56 +00003861PyObject *
3862PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003863 Py_ssize_t size,
3864 const char *errors,
3865 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003866{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003867 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003868 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003869 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003870 Py_ssize_t startinpos;
3871 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003872 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003873 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003874 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003875 PyObject *errorHandler = NULL;
3876 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003877 Py_UCS4 maxchar = 0;
3878 Py_ssize_t unicode_size;
3879 Py_ssize_t i;
3880 int kind;
3881 void *data;
3882 int has_errors;
3883 Py_UNICODE *error_outptr;
3884#if SIZEOF_WCHAR_T == 2
3885 Py_ssize_t wchar_offset = 0;
3886#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003887
Walter Dörwald69652032004-09-07 20:24:22 +00003888 if (size == 0) {
3889 if (consumed)
3890 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003891 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003892 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3894 consumed, &has_errors);
3895 if (has_errors) {
3896 unicode = _PyUnicode_New(size);
3897 if (!unicode)
3898 return NULL;
3899 kind = PyUnicode_WCHAR_KIND;
3900 data = PyUnicode_AS_UNICODE(unicode);
3901 assert(data != NULL);
3902 }
3903 else {
3904 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3905 if (!unicode)
3906 return NULL;
3907 /* When the string is ASCII only, just use memcpy and return.
3908 unicode_size may be != size if there is an incomplete UTF-8
3909 sequence at the end of the ASCII block. */
3910 if (maxchar < 128 && size == unicode_size) {
3911 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3912 return (PyObject *)unicode;
3913 }
3914 kind = PyUnicode_KIND(unicode);
3915 data = PyUnicode_DATA(unicode);
3916 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003918 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003919 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003920 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921
3922 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003923 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003924
3925 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003926 /* Fast path for runs of ASCII characters. Given that common UTF-8
3927 input will consist of an overwhelming majority of ASCII
3928 characters, we try to optimize for this case by checking
3929 as many characters as a C 'long' can contain.
3930 First, check if we can do an aligned read, as most CPUs have
3931 a penalty for unaligned reads.
3932 */
3933 if (!((size_t) s & LONG_PTR_MASK)) {
3934 /* Help register allocation */
3935 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003936 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003937 while (_s < aligned_end) {
3938 /* Read a whole long at a time (either 4 or 8 bytes),
3939 and do a fast unrolled copy if it only contains ASCII
3940 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003941 unsigned long value = *(unsigned long *) _s;
3942 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003943 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003944 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3945 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3946 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3947 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003948#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003949 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3950 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3951 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3952 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003953#endif
3954 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003955 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003956 }
3957 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003958 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003959 if (s == e)
3960 break;
3961 ch = (unsigned char)*s;
3962 }
3963 }
3964
3965 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003966 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967 s++;
3968 continue;
3969 }
3970
3971 n = utf8_code_length[ch];
3972
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003973 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003974 if (consumed)
3975 break;
3976 else {
3977 errmsg = "unexpected end of data";
3978 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003979 endinpos = startinpos+1;
3980 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3981 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003982 goto utf8Error;
3983 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003984 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985
3986 switch (n) {
3987
3988 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003989 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003990 startinpos = s-starts;
3991 endinpos = startinpos+1;
3992 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993
3994 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003995 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003996 startinpos = s-starts;
3997 endinpos = startinpos+1;
3998 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003999
4000 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004001 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004002 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004003 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004004 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004005 goto utf8Error;
4006 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004008 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004009 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010 break;
4011
4012 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004013 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4014 will result in surrogates in range d800-dfff. Surrogates are
4015 not valid UTF-8 so they are rejected.
4016 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4017 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004018 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004019 (s[2] & 0xc0) != 0x80 ||
4020 ((unsigned char)s[0] == 0xE0 &&
4021 (unsigned char)s[1] < 0xA0) ||
4022 ((unsigned char)s[0] == 0xED &&
4023 (unsigned char)s[1] > 0x9F)) {
4024 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004025 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004026 endinpos = startinpos + 1;
4027
4028 /* if s[1] first two bits are 1 and 0, then the invalid
4029 continuation byte is s[2], so increment endinpos by 1,
4030 if not, s[1] is invalid and endinpos doesn't need to
4031 be incremented. */
4032 if ((s[1] & 0xC0) == 0x80)
4033 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004034 goto utf8Error;
4035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004037 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004038 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004039 break;
4040
4041 case 4:
4042 if ((s[1] & 0xc0) != 0x80 ||
4043 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004044 (s[3] & 0xc0) != 0x80 ||
4045 ((unsigned char)s[0] == 0xF0 &&
4046 (unsigned char)s[1] < 0x90) ||
4047 ((unsigned char)s[0] == 0xF4 &&
4048 (unsigned char)s[1] > 0x8F)) {
4049 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004050 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004051 endinpos = startinpos + 1;
4052 if ((s[1] & 0xC0) == 0x80) {
4053 endinpos++;
4054 if ((s[2] & 0xC0) == 0x80)
4055 endinpos++;
4056 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004057 goto utf8Error;
4058 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004059 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004060 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4061 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004063 /* If the string is flexible or we have native UCS-4, write
4064 directly.. */
4065 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4066 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004068 else {
4069 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004071 /* translate from 10000..10FFFF to 0..FFFF */
4072 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004073
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004074 /* high surrogate = top 10 bits added to D800 */
4075 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4076 (Py_UNICODE)(0xD800 + (ch >> 10)));
4077
4078 /* low surrogate = bottom 10 bits added to DC00 */
4079 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4080 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4081 }
4082#if SIZEOF_WCHAR_T == 2
4083 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004084#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004086 }
4087 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004088 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004089
Benjamin Peterson29060642009-01-31 22:14:21 +00004090 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004091 /* If this is not yet a resizable string, make it one.. */
4092 if (kind != PyUnicode_WCHAR_KIND) {
4093 const Py_UNICODE *u;
4094 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4095 if (!new_unicode)
4096 goto onError;
4097 u = PyUnicode_AsUnicode((PyObject *)unicode);
4098 if (!u)
4099 goto onError;
4100#if SIZEOF_WCHAR_T == 2
4101 i += wchar_offset;
4102#endif
4103 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4104 Py_DECREF(unicode);
4105 unicode = new_unicode;
4106 kind = 0;
4107 data = PyUnicode_AS_UNICODE(new_unicode);
4108 assert(data != NULL);
4109 }
4110 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004111 if (unicode_decode_call_errorhandler(
4112 errors, &errorHandler,
4113 "utf8", errmsg,
4114 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004115 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004116 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004117 /* Update data because unicode_decode_call_errorhandler might have
4118 re-created or resized the unicode object. */
4119 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004120 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004122 /* Ensure the unicode_size calculation above was correct: */
4123 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4124
Walter Dörwald69652032004-09-07 20:24:22 +00004125 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004126 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004128 /* Adjust length and ready string when it contained errors and
4129 is of the old resizable kind. */
4130 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02004131 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0 ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004132 PyUnicode_READY(unicode) == -1)
4133 goto onError;
4134 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004136 Py_XDECREF(errorHandler);
4137 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004138 if (PyUnicode_READY(unicode) == -1) {
4139 Py_DECREF(unicode);
4140 return NULL;
4141 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142 return (PyObject *)unicode;
4143
Benjamin Peterson29060642009-01-31 22:14:21 +00004144 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004145 Py_XDECREF(errorHandler);
4146 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147 Py_DECREF(unicode);
4148 return NULL;
4149}
4150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004151#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004152
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004153#ifdef __APPLE__
4154
4155/* Simplified UTF-8 decoder using surrogateescape error handler,
4156 used to decode the command line arguments on Mac OS X. */
4157
4158wchar_t*
4159_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4160{
4161 int n;
4162 const char *e;
4163 wchar_t *unicode, *p;
4164
4165 /* Note: size will always be longer than the resulting Unicode
4166 character count */
4167 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4168 PyErr_NoMemory();
4169 return NULL;
4170 }
4171 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4172 if (!unicode)
4173 return NULL;
4174
4175 /* Unpack UTF-8 encoded data */
4176 p = unicode;
4177 e = s + size;
4178 while (s < e) {
4179 Py_UCS4 ch = (unsigned char)*s;
4180
4181 if (ch < 0x80) {
4182 *p++ = (wchar_t)ch;
4183 s++;
4184 continue;
4185 }
4186
4187 n = utf8_code_length[ch];
4188 if (s + n > e) {
4189 goto surrogateescape;
4190 }
4191
4192 switch (n) {
4193 case 0:
4194 case 1:
4195 goto surrogateescape;
4196
4197 case 2:
4198 if ((s[1] & 0xc0) != 0x80)
4199 goto surrogateescape;
4200 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4201 assert ((ch > 0x007F) && (ch <= 0x07FF));
4202 *p++ = (wchar_t)ch;
4203 break;
4204
4205 case 3:
4206 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4207 will result in surrogates in range d800-dfff. Surrogates are
4208 not valid UTF-8 so they are rejected.
4209 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4210 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4211 if ((s[1] & 0xc0) != 0x80 ||
4212 (s[2] & 0xc0) != 0x80 ||
4213 ((unsigned char)s[0] == 0xE0 &&
4214 (unsigned char)s[1] < 0xA0) ||
4215 ((unsigned char)s[0] == 0xED &&
4216 (unsigned char)s[1] > 0x9F)) {
4217
4218 goto surrogateescape;
4219 }
4220 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4221 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004222 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004223 break;
4224
4225 case 4:
4226 if ((s[1] & 0xc0) != 0x80 ||
4227 (s[2] & 0xc0) != 0x80 ||
4228 (s[3] & 0xc0) != 0x80 ||
4229 ((unsigned char)s[0] == 0xF0 &&
4230 (unsigned char)s[1] < 0x90) ||
4231 ((unsigned char)s[0] == 0xF4 &&
4232 (unsigned char)s[1] > 0x8F)) {
4233 goto surrogateescape;
4234 }
4235 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4236 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4237 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4238
4239#if SIZEOF_WCHAR_T == 4
4240 *p++ = (wchar_t)ch;
4241#else
4242 /* compute and append the two surrogates: */
4243
4244 /* translate from 10000..10FFFF to 0..FFFF */
4245 ch -= 0x10000;
4246
4247 /* high surrogate = top 10 bits added to D800 */
4248 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4249
4250 /* low surrogate = bottom 10 bits added to DC00 */
4251 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4252#endif
4253 break;
4254 }
4255 s += n;
4256 continue;
4257
4258 surrogateescape:
4259 *p++ = 0xDC00 + ch;
4260 s++;
4261 }
4262 *p = L'\0';
4263 return unicode;
4264}
4265
4266#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004268/* Primary internal function which creates utf8 encoded bytes objects.
4269
4270 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004271 and allocate exactly as much space needed at the end. Else allocate the
4272 maximum possible needed (4 result bytes per Unicode character), and return
4273 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004274*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004275PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004276_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277{
Tim Peters602f7402002-04-27 18:03:26 +00004278#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004279
Guido van Rossum98297ee2007-11-06 21:34:58 +00004280 Py_ssize_t i; /* index into s of next input byte */
4281 PyObject *result; /* result string object */
4282 char *p; /* next free byte in output buffer */
4283 Py_ssize_t nallocated; /* number of result bytes allocated */
4284 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004285 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004286 PyObject *errorHandler = NULL;
4287 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004288 int kind;
4289 void *data;
4290 Py_ssize_t size;
4291 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4292#if SIZEOF_WCHAR_T == 2
4293 Py_ssize_t wchar_offset = 0;
4294#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004296 if (!PyUnicode_Check(unicode)) {
4297 PyErr_BadArgument();
4298 return NULL;
4299 }
4300
4301 if (PyUnicode_READY(unicode) == -1)
4302 return NULL;
4303
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004304 if (PyUnicode_UTF8(unicode))
4305 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4306 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004307
4308 kind = PyUnicode_KIND(unicode);
4309 data = PyUnicode_DATA(unicode);
4310 size = PyUnicode_GET_LENGTH(unicode);
4311
Tim Peters602f7402002-04-27 18:03:26 +00004312 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313
Tim Peters602f7402002-04-27 18:03:26 +00004314 if (size <= MAX_SHORT_UNICHARS) {
4315 /* Write into the stack buffer; nallocated can't overflow.
4316 * At the end, we'll allocate exactly as much heap space as it
4317 * turns out we need.
4318 */
4319 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004320 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004321 p = stackbuf;
4322 }
4323 else {
4324 /* Overallocate on the heap, and give the excess back at the end. */
4325 nallocated = size * 4;
4326 if (nallocated / 4 != size) /* overflow! */
4327 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004328 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004329 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004330 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004331 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004332 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004333
Tim Peters602f7402002-04-27 18:03:26 +00004334 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004335 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004336
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004337 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004338 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004340
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004342 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004343 *p++ = (char)(0xc0 | (ch >> 6));
4344 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004345 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004346 Py_ssize_t newpos;
4347 PyObject *rep;
4348 Py_ssize_t repsize, k, startpos;
4349 startpos = i-1;
4350#if SIZEOF_WCHAR_T == 2
4351 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004352#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004353 rep = unicode_encode_call_errorhandler(
4354 errors, &errorHandler, "utf-8", "surrogates not allowed",
4355 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4356 &exc, startpos, startpos+1, &newpos);
4357 if (!rep)
4358 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004360 if (PyBytes_Check(rep))
4361 repsize = PyBytes_GET_SIZE(rep);
4362 else
4363 repsize = PyUnicode_GET_SIZE(rep);
4364
4365 if (repsize > 4) {
4366 Py_ssize_t offset;
4367
4368 if (result == NULL)
4369 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004370 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004371 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004373 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4374 /* integer overflow */
4375 PyErr_NoMemory();
4376 goto error;
4377 }
4378 nallocated += repsize - 4;
4379 if (result != NULL) {
4380 if (_PyBytes_Resize(&result, nallocated) < 0)
4381 goto error;
4382 } else {
4383 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004384 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004385 goto error;
4386 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4387 }
4388 p = PyBytes_AS_STRING(result) + offset;
4389 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004391 if (PyBytes_Check(rep)) {
4392 char *prep = PyBytes_AS_STRING(rep);
4393 for(k = repsize; k > 0; k--)
4394 *p++ = *prep++;
4395 } else /* rep is unicode */ {
4396 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4397 Py_UNICODE c;
4398
4399 for(k=0; k<repsize; k++) {
4400 c = prep[k];
4401 if (0x80 <= c) {
4402 raise_encode_exception(&exc, "utf-8",
4403 PyUnicode_AS_UNICODE(unicode),
4404 size, i-1, i,
4405 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004406 goto error;
4407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004408 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004409 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004411 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004412 } else if (ch < 0x10000) {
4413 *p++ = (char)(0xe0 | (ch >> 12));
4414 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4415 *p++ = (char)(0x80 | (ch & 0x3f));
4416 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004417 /* Encode UCS4 Unicode ordinals */
4418 *p++ = (char)(0xf0 | (ch >> 18));
4419 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4420 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4421 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004422#if SIZEOF_WCHAR_T == 2
4423 wchar_offset++;
4424#endif
Tim Peters602f7402002-04-27 18:03:26 +00004425 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004427
Guido van Rossum98297ee2007-11-06 21:34:58 +00004428 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004429 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004430 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004431 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004432 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004433 }
4434 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004435 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004436 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004437 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004438 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004439 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004440
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004441 Py_XDECREF(errorHandler);
4442 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004443 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004444 error:
4445 Py_XDECREF(errorHandler);
4446 Py_XDECREF(exc);
4447 Py_XDECREF(result);
4448 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004449
Tim Peters602f7402002-04-27 18:03:26 +00004450#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451}
4452
Alexander Belopolsky40018472011-02-26 01:02:56 +00004453PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004454PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4455 Py_ssize_t size,
4456 const char *errors)
4457{
4458 PyObject *v, *unicode;
4459
4460 unicode = PyUnicode_FromUnicode(s, size);
4461 if (unicode == NULL)
4462 return NULL;
4463 v = _PyUnicode_AsUTF8String(unicode, errors);
4464 Py_DECREF(unicode);
4465 return v;
4466}
4467
4468PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004469PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004471 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472}
4473
Walter Dörwald41980ca2007-08-16 21:55:45 +00004474/* --- UTF-32 Codec ------------------------------------------------------- */
4475
4476PyObject *
4477PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004478 Py_ssize_t size,
4479 const char *errors,
4480 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004481{
4482 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4483}
4484
4485PyObject *
4486PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004487 Py_ssize_t size,
4488 const char *errors,
4489 int *byteorder,
4490 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004491{
4492 const char *starts = s;
4493 Py_ssize_t startinpos;
4494 Py_ssize_t endinpos;
4495 Py_ssize_t outpos;
4496 PyUnicodeObject *unicode;
4497 Py_UNICODE *p;
4498#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004499 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004500 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004501#else
4502 const int pairs = 0;
4503#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004504 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004505 int bo = 0; /* assume native ordering by default */
4506 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004507 /* Offsets from q for retrieving bytes in the right order. */
4508#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4509 int iorder[] = {0, 1, 2, 3};
4510#else
4511 int iorder[] = {3, 2, 1, 0};
4512#endif
4513 PyObject *errorHandler = NULL;
4514 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004515
Walter Dörwald41980ca2007-08-16 21:55:45 +00004516 q = (unsigned char *)s;
4517 e = q + size;
4518
4519 if (byteorder)
4520 bo = *byteorder;
4521
4522 /* Check for BOM marks (U+FEFF) in the input and adjust current
4523 byte order setting accordingly. In native mode, the leading BOM
4524 mark is skipped, in all other modes, it is copied to the output
4525 stream as-is (giving a ZWNBSP character). */
4526 if (bo == 0) {
4527 if (size >= 4) {
4528 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004530#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004531 if (bom == 0x0000FEFF) {
4532 q += 4;
4533 bo = -1;
4534 }
4535 else if (bom == 0xFFFE0000) {
4536 q += 4;
4537 bo = 1;
4538 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004539#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004540 if (bom == 0x0000FEFF) {
4541 q += 4;
4542 bo = 1;
4543 }
4544 else if (bom == 0xFFFE0000) {
4545 q += 4;
4546 bo = -1;
4547 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004548#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004549 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004550 }
4551
4552 if (bo == -1) {
4553 /* force LE */
4554 iorder[0] = 0;
4555 iorder[1] = 1;
4556 iorder[2] = 2;
4557 iorder[3] = 3;
4558 }
4559 else if (bo == 1) {
4560 /* force BE */
4561 iorder[0] = 3;
4562 iorder[1] = 2;
4563 iorder[2] = 1;
4564 iorder[3] = 0;
4565 }
4566
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004567 /* On narrow builds we split characters outside the BMP into two
4568 codepoints => count how much extra space we need. */
4569#ifndef Py_UNICODE_WIDE
4570 for (qq = q; qq < e; qq += 4)
4571 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4572 pairs++;
4573#endif
4574
4575 /* This might be one to much, because of a BOM */
4576 unicode = _PyUnicode_New((size+3)/4+pairs);
4577 if (!unicode)
4578 return NULL;
4579 if (size == 0)
4580 return (PyObject *)unicode;
4581
4582 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004583 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004584
Walter Dörwald41980ca2007-08-16 21:55:45 +00004585 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004586 Py_UCS4 ch;
4587 /* remaining bytes at the end? (size should be divisible by 4) */
4588 if (e-q<4) {
4589 if (consumed)
4590 break;
4591 errmsg = "truncated data";
4592 startinpos = ((const char *)q)-starts;
4593 endinpos = ((const char *)e)-starts;
4594 goto utf32Error;
4595 /* The remaining input chars are ignored if the callback
4596 chooses to skip the input */
4597 }
4598 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4599 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004600
Benjamin Peterson29060642009-01-31 22:14:21 +00004601 if (ch >= 0x110000)
4602 {
4603 errmsg = "codepoint not in range(0x110000)";
4604 startinpos = ((const char *)q)-starts;
4605 endinpos = startinpos+4;
4606 goto utf32Error;
4607 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004608#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004609 if (ch >= 0x10000)
4610 {
4611 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4612 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4613 }
4614 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004615#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004616 *p++ = ch;
4617 q += 4;
4618 continue;
4619 utf32Error:
4620 outpos = p-PyUnicode_AS_UNICODE(unicode);
4621 if (unicode_decode_call_errorhandler(
4622 errors, &errorHandler,
4623 "utf32", errmsg,
4624 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4625 &unicode, &outpos, &p))
4626 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004627 }
4628
4629 if (byteorder)
4630 *byteorder = bo;
4631
4632 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004633 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004634
4635 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004636 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004637 goto onError;
4638
4639 Py_XDECREF(errorHandler);
4640 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004641 if (PyUnicode_READY(unicode) == -1) {
4642 Py_DECREF(unicode);
4643 return NULL;
4644 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004645 return (PyObject *)unicode;
4646
Benjamin Peterson29060642009-01-31 22:14:21 +00004647 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004648 Py_DECREF(unicode);
4649 Py_XDECREF(errorHandler);
4650 Py_XDECREF(exc);
4651 return NULL;
4652}
4653
4654PyObject *
4655PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004656 Py_ssize_t size,
4657 const char *errors,
4658 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004659{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004660 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004661 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004662 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004663#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004664 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004665#else
4666 const int pairs = 0;
4667#endif
4668 /* Offsets from p for storing byte pairs in the right order. */
4669#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4670 int iorder[] = {0, 1, 2, 3};
4671#else
4672 int iorder[] = {3, 2, 1, 0};
4673#endif
4674
Benjamin Peterson29060642009-01-31 22:14:21 +00004675#define STORECHAR(CH) \
4676 do { \
4677 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4678 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4679 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4680 p[iorder[0]] = (CH) & 0xff; \
4681 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004682 } while(0)
4683
4684 /* In narrow builds we can output surrogate pairs as one codepoint,
4685 so we need less space. */
4686#ifndef Py_UNICODE_WIDE
4687 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004688 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4689 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4690 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004691#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004692 nsize = (size - pairs + (byteorder == 0));
4693 bytesize = nsize * 4;
4694 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004695 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004696 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004697 if (v == NULL)
4698 return NULL;
4699
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004700 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004701 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004702 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004703 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004704 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004705
4706 if (byteorder == -1) {
4707 /* force LE */
4708 iorder[0] = 0;
4709 iorder[1] = 1;
4710 iorder[2] = 2;
4711 iorder[3] = 3;
4712 }
4713 else if (byteorder == 1) {
4714 /* force BE */
4715 iorder[0] = 3;
4716 iorder[1] = 2;
4717 iorder[2] = 1;
4718 iorder[3] = 0;
4719 }
4720
4721 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004722 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004723#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004724 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4725 Py_UCS4 ch2 = *s;
4726 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4727 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4728 s++;
4729 size--;
4730 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004731 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004732#endif
4733 STORECHAR(ch);
4734 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004735
4736 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004737 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004738#undef STORECHAR
4739}
4740
Alexander Belopolsky40018472011-02-26 01:02:56 +00004741PyObject *
4742PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004743{
4744 if (!PyUnicode_Check(unicode)) {
4745 PyErr_BadArgument();
4746 return NULL;
4747 }
4748 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004749 PyUnicode_GET_SIZE(unicode),
4750 NULL,
4751 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004752}
4753
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754/* --- UTF-16 Codec ------------------------------------------------------- */
4755
Tim Peters772747b2001-08-09 22:21:55 +00004756PyObject *
4757PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004758 Py_ssize_t size,
4759 const char *errors,
4760 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761{
Walter Dörwald69652032004-09-07 20:24:22 +00004762 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4763}
4764
Antoine Pitrouab868312009-01-10 15:40:25 +00004765/* Two masks for fast checking of whether a C 'long' may contain
4766 UTF16-encoded surrogate characters. This is an efficient heuristic,
4767 assuming that non-surrogate characters with a code point >= 0x8000 are
4768 rare in most input.
4769 FAST_CHAR_MASK is used when the input is in native byte ordering,
4770 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004771*/
/* Each mask selects the 0x8000 bit of every 16-bit unit packed in a long
   (the 0x80 bit of the other byte for SWAPPED_FAST_CHAR_MASK).  A zero
   result of (data & mask) therefore means every unit is below 0x8000 and
   cannot be a surrogate; a non-zero result only means "maybe".
   Both macros are #undef'd after PyUnicode_DecodeUTF16Stateful(). */
Antoine Pitrouab868312009-01-10 15:40:25 +00004772#if (SIZEOF_LONG == 8)
4773# define FAST_CHAR_MASK 0x8000800080008000L
4774# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4775#elif (SIZEOF_LONG == 4)
4776# define FAST_CHAR_MASK 0x80008000L
4777# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4778#else
4779# error C 'long' size should be either 4 or 8!
4780#endif
4781
Walter Dörwald69652032004-09-07 20:24:22 +00004782PyObject *
4783PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004784 Py_ssize_t size,
4785 const char *errors,
4786 int *byteorder,
4787 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004788{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004789 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004790 Py_ssize_t startinpos;
4791 Py_ssize_t endinpos;
4792 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793 PyUnicodeObject *unicode;
4794 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004795 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004796 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004797 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004798 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004799 /* Offsets from q for retrieving byte pairs in the right order. */
4800#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4801 int ihi = 1, ilo = 0;
4802#else
4803 int ihi = 0, ilo = 1;
4804#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004805 PyObject *errorHandler = NULL;
4806 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807
4808 /* Note: size will always be longer than the resulting Unicode
4809 character count */
4810 unicode = _PyUnicode_New(size);
4811 if (!unicode)
4812 return NULL;
4813 if (size == 0)
4814 return (PyObject *)unicode;
4815
4816 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004817 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004818 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004819 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820
4821 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004822 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004824 /* Check for BOM marks (U+FEFF) in the input and adjust current
4825 byte order setting accordingly. In native mode, the leading BOM
4826 mark is skipped, in all other modes, it is copied to the output
4827 stream as-is (giving a ZWNBSP character). */
4828 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004829 if (size >= 2) {
4830 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004831#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004832 if (bom == 0xFEFF) {
4833 q += 2;
4834 bo = -1;
4835 }
4836 else if (bom == 0xFFFE) {
4837 q += 2;
4838 bo = 1;
4839 }
Tim Petersced69f82003-09-16 20:30:58 +00004840#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004841 if (bom == 0xFEFF) {
4842 q += 2;
4843 bo = 1;
4844 }
4845 else if (bom == 0xFFFE) {
4846 q += 2;
4847 bo = -1;
4848 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004849#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004850 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004851 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852
Tim Peters772747b2001-08-09 22:21:55 +00004853 if (bo == -1) {
4854 /* force LE */
4855 ihi = 1;
4856 ilo = 0;
4857 }
4858 else if (bo == 1) {
4859 /* force BE */
4860 ihi = 0;
4861 ilo = 1;
4862 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004863#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4864 native_ordering = ilo < ihi;
4865#else
4866 native_ordering = ilo > ihi;
4867#endif
Tim Peters772747b2001-08-09 22:21:55 +00004868
Antoine Pitrouab868312009-01-10 15:40:25 +00004869 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004870 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004871 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004872 /* First check for possible aligned read of a C 'long'. Unaligned
4873 reads are more expensive, better to defer to another iteration. */
4874 if (!((size_t) q & LONG_PTR_MASK)) {
4875 /* Fast path for runs of non-surrogate chars. */
4876 register const unsigned char *_q = q;
4877 Py_UNICODE *_p = p;
4878 if (native_ordering) {
4879 /* Native ordering is simple: as long as the input cannot
4880 possibly contain a surrogate char, do an unrolled copy
4881 of several 16-bit code points to the target object.
4882 The non-surrogate check is done on several input bytes
4883 at a time (as many as a C 'long' can contain). */
4884 while (_q < aligned_end) {
4885 unsigned long data = * (unsigned long *) _q;
4886 if (data & FAST_CHAR_MASK)
4887 break;
4888 _p[0] = ((unsigned short *) _q)[0];
4889 _p[1] = ((unsigned short *) _q)[1];
4890#if (SIZEOF_LONG == 8)
4891 _p[2] = ((unsigned short *) _q)[2];
4892 _p[3] = ((unsigned short *) _q)[3];
4893#endif
4894 _q += SIZEOF_LONG;
4895 _p += SIZEOF_LONG / 2;
4896 }
4897 }
4898 else {
4899 /* Byteswapped ordering is similar, but we must decompose
4900 the copy bytewise, and take care of zero'ing out the
4901 upper bytes if the target object is in 32-bit units
4902 (that is, in UCS-4 builds). */
4903 while (_q < aligned_end) {
4904 unsigned long data = * (unsigned long *) _q;
4905 if (data & SWAPPED_FAST_CHAR_MASK)
4906 break;
4907 /* Zero upper bytes in UCS-4 builds */
4908#if (Py_UNICODE_SIZE > 2)
4909 _p[0] = 0;
4910 _p[1] = 0;
4911#if (SIZEOF_LONG == 8)
4912 _p[2] = 0;
4913 _p[3] = 0;
4914#endif
4915#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004916 /* Issue #4916; UCS-4 builds on big endian machines must
4917 fill the two last bytes of each 4-byte unit. */
4918#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4919# define OFF 2
4920#else
4921# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004922#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004923 ((unsigned char *) _p)[OFF + 1] = _q[0];
4924 ((unsigned char *) _p)[OFF + 0] = _q[1];
4925 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4926 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4927#if (SIZEOF_LONG == 8)
4928 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4929 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4930 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4931 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4932#endif
4933#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004934 _q += SIZEOF_LONG;
4935 _p += SIZEOF_LONG / 2;
4936 }
4937 }
4938 p = _p;
4939 q = _q;
4940 if (q >= e)
4941 break;
4942 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004943 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004944
Benjamin Peterson14339b62009-01-31 16:36:08 +00004945 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004946
4947 if (ch < 0xD800 || ch > 0xDFFF) {
4948 *p++ = ch;
4949 continue;
4950 }
4951
4952 /* UTF-16 code pair: */
4953 if (q > e) {
4954 errmsg = "unexpected end of data";
4955 startinpos = (((const char *)q) - 2) - starts;
4956 endinpos = ((const char *)e) + 1 - starts;
4957 goto utf16Error;
4958 }
4959 if (0xD800 <= ch && ch <= 0xDBFF) {
4960 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4961 q += 2;
4962 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004963#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004964 *p++ = ch;
4965 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004966#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004967 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004968#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004969 continue;
4970 }
4971 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004972 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004973 startinpos = (((const char *)q)-4)-starts;
4974 endinpos = startinpos+2;
4975 goto utf16Error;
4976 }
4977
Benjamin Peterson14339b62009-01-31 16:36:08 +00004978 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004979 errmsg = "illegal encoding";
4980 startinpos = (((const char *)q)-2)-starts;
4981 endinpos = startinpos+2;
4982 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004983
Benjamin Peterson29060642009-01-31 22:14:21 +00004984 utf16Error:
4985 outpos = p - PyUnicode_AS_UNICODE(unicode);
4986 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004987 errors,
4988 &errorHandler,
4989 "utf16", errmsg,
4990 &starts,
4991 (const char **)&e,
4992 &startinpos,
4993 &endinpos,
4994 &exc,
4995 (const char **)&q,
4996 &unicode,
4997 &outpos,
4998 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004999 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005001 /* remaining byte at the end? (size should be even) */
5002 if (e == q) {
5003 if (!consumed) {
5004 errmsg = "truncated data";
5005 startinpos = ((const char *)q) - starts;
5006 endinpos = ((const char *)e) + 1 - starts;
5007 outpos = p - PyUnicode_AS_UNICODE(unicode);
5008 if (unicode_decode_call_errorhandler(
5009 errors,
5010 &errorHandler,
5011 "utf16", errmsg,
5012 &starts,
5013 (const char **)&e,
5014 &startinpos,
5015 &endinpos,
5016 &exc,
5017 (const char **)&q,
5018 &unicode,
5019 &outpos,
5020 &p))
5021 goto onError;
5022 /* The remaining input chars are ignored if the callback
5023 chooses to skip the input */
5024 }
5025 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005026
5027 if (byteorder)
5028 *byteorder = bo;
5029
Walter Dörwald69652032004-09-07 20:24:22 +00005030 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005031 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005032
Guido van Rossumd57fd912000-03-10 22:53:23 +00005033 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005034 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035 goto onError;
5036
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005037 Py_XDECREF(errorHandler);
5038 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005039 if (PyUnicode_READY(unicode) == -1) {
5040 Py_DECREF(unicode);
5041 return NULL;
5042 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005043 return (PyObject *)unicode;
5044
Benjamin Peterson29060642009-01-31 22:14:21 +00005045 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005046 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005047 Py_XDECREF(errorHandler);
5048 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049 return NULL;
5050}
5051
Antoine Pitrouab868312009-01-10 15:40:25 +00005052#undef FAST_CHAR_MASK
5053#undef SWAPPED_FAST_CHAR_MASK
5054
Tim Peters772747b2001-08-09 22:21:55 +00005055PyObject *
5056PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005057 Py_ssize_t size,
5058 const char *errors,
5059 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005061 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005062 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005063 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005064#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005065 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005066#else
5067 const int pairs = 0;
5068#endif
Tim Peters772747b2001-08-09 22:21:55 +00005069 /* Offsets from p for storing byte pairs in the right order. */
5070#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5071 int ihi = 1, ilo = 0;
5072#else
5073 int ihi = 0, ilo = 1;
5074#endif
5075
Benjamin Peterson29060642009-01-31 22:14:21 +00005076#define STORECHAR(CH) \
5077 do { \
5078 p[ihi] = ((CH) >> 8) & 0xff; \
5079 p[ilo] = (CH) & 0xff; \
5080 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005081 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005083#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005084 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005085 if (s[i] >= 0x10000)
5086 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005087#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005088 /* 2 * (size + pairs + (byteorder == 0)) */
5089 if (size > PY_SSIZE_T_MAX ||
5090 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005091 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005092 nsize = size + pairs + (byteorder == 0);
5093 bytesize = nsize * 2;
5094 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005095 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005096 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097 if (v == NULL)
5098 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005100 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005101 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005102 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005103 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005104 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005105
5106 if (byteorder == -1) {
5107 /* force LE */
5108 ihi = 1;
5109 ilo = 0;
5110 }
5111 else if (byteorder == 1) {
5112 /* force BE */
5113 ihi = 0;
5114 ilo = 1;
5115 }
5116
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005117 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005118 Py_UNICODE ch = *s++;
5119 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005120#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005121 if (ch >= 0x10000) {
5122 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5123 ch = 0xD800 | ((ch-0x10000) >> 10);
5124 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005125#endif
Tim Peters772747b2001-08-09 22:21:55 +00005126 STORECHAR(ch);
5127 if (ch2)
5128 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005129 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005130
5131 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005132 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005133#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134}
5135
Alexander Belopolsky40018472011-02-26 01:02:56 +00005136PyObject *
5137PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005138{
5139 if (!PyUnicode_Check(unicode)) {
5140 PyErr_BadArgument();
5141 return NULL;
5142 }
5143 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005144 PyUnicode_GET_SIZE(unicode),
5145 NULL,
5146 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147}
5148
5149/* --- Unicode Escape Codec ----------------------------------------------- */
5150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005151/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5152 if all the escapes in the string make it still a valid ASCII string.
5153 Returns -1 if any escapes were found which cause the string to
5154 pop out of ASCII range. Otherwise returns the length of the
5155 required buffer to hold the string.
5156 */
5157Py_ssize_t
5158length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5159{
5160 const unsigned char *p = (const unsigned char *)s;
5161 const unsigned char *end = p + size;
5162 Py_ssize_t length = 0;
5163
5164 if (size < 0)
5165 return -1;
5166
5167 for (; p < end; ++p) {
5168 if (*p > 127) {
5169 /* Non-ASCII */
5170 return -1;
5171 }
5172 else if (*p != '\\') {
5173 /* Normal character */
5174 ++length;
5175 }
5176 else {
5177 /* Backslash-escape, check next char */
5178 ++p;
5179 /* Escape sequence reaches till end of string or
5180 non-ASCII follow-up. */
5181 if (p >= end || *p > 127)
5182 return -1;
5183 switch (*p) {
5184 case '\n':
5185 /* backslash + \n result in zero characters */
5186 break;
5187 case '\\': case '\'': case '\"':
5188 case 'b': case 'f': case 't':
5189 case 'n': case 'r': case 'v': case 'a':
5190 ++length;
5191 break;
5192 case '0': case '1': case '2': case '3':
5193 case '4': case '5': case '6': case '7':
5194 case 'x': case 'u': case 'U': case 'N':
5195 /* these do not guarantee ASCII characters */
5196 return -1;
5197 default:
5198 /* count the backslash + the other character */
5199 length += 2;
5200 }
5201 }
5202 }
5203 return length;
5204}
5205
5206/* Similar to PyUnicode_WRITE but either write into wstr field
5207 or treat string as ASCII. */
/* WRITE_ASCII_OR_WSTR(kind, buf, index, value):
   stores value at buf[index], as a Py_UNICODE when kind equals
   PyUnicode_WCHAR_KIND and as an unsigned char for every other kind.
   value is converted with a plain cast, so the byte store truncates.
   Wrapped in do/while(0), hence usable as a single statement. */
5208#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5209 do { \
5210 if ((kind) != PyUnicode_WCHAR_KIND) \
5211 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5212 else \
5213 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5214 } while (0)
5215
/* WRITE_WSTR(buf, index, value):
   stores value as a Py_UNICODE at buf[index].  Note that `kind` is NOT a
   macro parameter: the expansion reads a variable named `kind` from the
   scope where the macro is used and asserts that it equals
   PyUnicode_WCHAR_KIND.  The expansion is a bare comma expression (not a
   do/while block), so use it only as a full statement. */
5216#define WRITE_WSTR(buf, index, value) \
5217 assert(kind == PyUnicode_WCHAR_KIND), \
5218 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5219
5220
/* File-local pointer, NULL until assigned elsewhere in this file.
   NOTE(review): presumably the lazily loaded character-name lookup C API
   used for \N{...} escapes -- confirm against the code that sets it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005221static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Fredrik Lundh06d12682001-01-24 07:59:11 +00005221static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005222
Alexander Belopolsky40018472011-02-26 01:02:56 +00005223PyObject *
5224PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005225 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005226 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005228 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005229 Py_ssize_t startinpos;
5230 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005231 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005233 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005235 char* message;
5236 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005237 PyObject *errorHandler = NULL;
5238 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005239 Py_ssize_t ascii_length;
5240 Py_ssize_t i;
5241 int kind;
5242 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005243
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005244 ascii_length = length_of_escaped_ascii_string(s, size);
5245
5246 /* After length_of_escaped_ascii_string() there are two alternatives,
5247 either the string is pure ASCII with named escapes like \n, etc.
5248 and we determined it's exact size (common case)
5249 or it contains \x, \u, ... escape sequences. then we create a
5250 legacy wchar string and resize it at the end of this function. */
5251 if (ascii_length >= 0) {
5252 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5253 if (!v)
5254 goto onError;
5255 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5256 kind = PyUnicode_1BYTE_KIND;
5257 data = PyUnicode_DATA(v);
5258 }
5259 else {
5260 /* Escaped strings will always be longer than the resulting
5261 Unicode string, so we start with size here and then reduce the
5262 length after conversion to the true value.
5263 (but if the error callback returns a long replacement string
5264 we'll have to allocate more space) */
5265 v = _PyUnicode_New(size);
5266 if (!v)
5267 goto onError;
5268 kind = PyUnicode_WCHAR_KIND;
5269 data = PyUnicode_AS_UNICODE(v);
5270 }
5271
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272 if (size == 0)
5273 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005274 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005276
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277 while (s < end) {
5278 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005279 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005280 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005282 if (kind == PyUnicode_WCHAR_KIND) {
5283 assert(i < _PyUnicode_WSTR_LENGTH(v));
5284 }
5285 else {
5286 /* The only case in which i == ascii_length is a backslash
5287 followed by a newline. */
5288 assert(i <= ascii_length);
5289 }
5290
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291 /* Non-escape characters are interpreted as Unicode ordinals */
5292 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005293 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294 continue;
5295 }
5296
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005297 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 /* \ - Escapes */
5299 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005300 c = *s++;
5301 if (s > end)
5302 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005303
5304 if (kind == PyUnicode_WCHAR_KIND) {
5305 assert(i < _PyUnicode_WSTR_LENGTH(v));
5306 }
5307 else {
5308 /* The only case in which i == ascii_length is a backslash
5309 followed by a newline. */
5310 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5311 }
5312
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005313 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314
Benjamin Peterson29060642009-01-31 22:14:21 +00005315 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005317 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5318 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5319 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5320 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5321 /* FF */
5322 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5323 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5324 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5325 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5326 /* VT */
5327 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5328 /* BEL, not classic C */
5329 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330
Benjamin Peterson29060642009-01-31 22:14:21 +00005331 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332 case '0': case '1': case '2': case '3':
5333 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005334 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005335 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005336 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005337 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005338 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005340 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341 break;
5342
Benjamin Peterson29060642009-01-31 22:14:21 +00005343 /* hex escapes */
5344 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005346 digits = 2;
5347 message = "truncated \\xXX escape";
5348 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005352 digits = 4;
5353 message = "truncated \\uXXXX escape";
5354 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005357 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005358 digits = 8;
5359 message = "truncated \\UXXXXXXXX escape";
5360 hexescape:
5361 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005362 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005363 if (s+digits>end) {
5364 endinpos = size;
5365 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 errors, &errorHandler,
5367 "unicodeescape", "end of string in escape sequence",
5368 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005369 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005370 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005371 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005372 goto nextByte;
5373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005374 for (j = 0; j < digits; ++j) {
5375 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005376 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005377 endinpos = (s+j+1)-starts;
5378 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005379 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005380 errors, &errorHandler,
5381 "unicodeescape", message,
5382 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005383 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005384 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005385 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005386 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005387 }
5388 chr = (chr<<4) & ~0xF;
5389 if (c >= '0' && c <= '9')
5390 chr += c - '0';
5391 else if (c >= 'a' && c <= 'f')
5392 chr += 10 + c - 'a';
5393 else
5394 chr += 10 + c - 'A';
5395 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005396 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005397 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005398 /* _decoding_error will have already written into the
5399 target buffer. */
5400 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005401 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005402 /* when we get here, chr is a 32-bit unicode character */
5403 if (chr <= 0xffff)
5404 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005405 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005406 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005407 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005408 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005409#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005410 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005411#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005412 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005413 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5414 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005415#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005416 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005417 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005418 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005419 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005420 errors, &errorHandler,
5421 "unicodeescape", "illegal Unicode character",
5422 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005423 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005424 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005425 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005426 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005427 break;
5428
Benjamin Peterson29060642009-01-31 22:14:21 +00005429 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005430 case 'N':
5431 message = "malformed \\N character escape";
5432 if (ucnhash_CAPI == NULL) {
5433 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005434 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5435 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005436 if (ucnhash_CAPI == NULL)
5437 goto ucnhashError;
5438 }
5439 if (*s == '{') {
5440 const char *start = s+1;
5441 /* look for the closing brace */
5442 while (*s != '}' && s < end)
5443 s++;
5444 if (s > start && s < end && *s == '}') {
5445 /* found a name. look it up in the unicode database */
5446 message = "unknown Unicode character name";
5447 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005448 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5449 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005450 goto store;
5451 }
5452 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005453 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005454 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005455 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005456 errors, &errorHandler,
5457 "unicodeescape", message,
5458 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005459 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005460 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005461 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005462 break;
5463
5464 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005465 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005466 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005467 message = "\\ at end of string";
5468 s--;
5469 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005470 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005471 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005472 errors, &errorHandler,
5473 "unicodeescape", message,
5474 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005475 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005476 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005477 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005478 }
5479 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005480 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5481 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005482 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005483 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005486 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005488 /* Ensure the length prediction worked in case of ASCII strings */
5489 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5490
Victor Stinnerfe226c02011-10-03 03:52:20 +02005491 if (kind == PyUnicode_WCHAR_KIND)
5492 {
5493 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5494 goto onError;
5495 if (PyUnicode_READY(v) == -1)
5496 goto onError;
5497 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005498 Py_XDECREF(errorHandler);
5499 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005501
Benjamin Peterson29060642009-01-31 22:14:21 +00005502 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005503 PyErr_SetString(
5504 PyExc_UnicodeError,
5505 "\\N escapes not supported (can't load unicodedata module)"
5506 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005507 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005508 Py_XDECREF(errorHandler);
5509 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005510 return NULL;
5511
Benjamin Peterson29060642009-01-31 22:14:21 +00005512 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005514 Py_XDECREF(errorHandler);
5515 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516 return NULL;
5517}
5518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005519#undef WRITE_ASCII_OR_WSTR
5520#undef WRITE_WSTR
5521
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522/* Return a Unicode-Escape string version of the Unicode object.
5523
5524 If quotes is true, the string is enclosed in u"" or u'' quotes as
5525 appropriate.
5526
5527*/
5528
Walter Dörwald79e913e2007-05-12 11:08:06 +00005529static const char *hexdigits = "0123456789abcdef";
5530
Alexander Belopolsky40018472011-02-26 01:02:56 +00005531PyObject *
5532PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005533 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005535 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005538#ifdef Py_UNICODE_WIDE
5539 const Py_ssize_t expandsize = 10;
5540#else
5541 const Py_ssize_t expandsize = 6;
5542#endif
5543
Thomas Wouters89f507f2006-12-13 04:49:30 +00005544 /* XXX(nnorwitz): rather than over-allocating, it would be
5545 better to choose a different scheme. Perhaps scan the
5546 first N-chars of the string and allocate based on that size.
5547 */
5548 /* Initial allocation is based on the longest-possible unichr
5549 escape.
5550
5551 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5552 unichr, so in this case it's the longest unichr escape. In
5553 narrow (UTF-16) builds this is five chars per source unichr
5554 since there are two unichrs in the surrogate pair, so in narrow
5555 (UTF-16) builds it's not the longest unichr escape.
5556
5557 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5558 so in the narrow (UTF-16) build case it's the longest unichr
5559 escape.
5560 */
5561
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005562 if (size == 0)
5563 return PyBytes_FromStringAndSize(NULL, 0);
5564
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005565 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005566 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005567
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005568 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005569 2
5570 + expandsize*size
5571 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 if (repr == NULL)
5573 return NULL;
5574
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005575 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 while (size-- > 0) {
5578 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005579
Walter Dörwald79e913e2007-05-12 11:08:06 +00005580 /* Escape backslashes */
5581 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 *p++ = '\\';
5583 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005584 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005585 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005586
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005587#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005588 /* Map 21-bit characters to '\U00xxxxxx' */
5589 else if (ch >= 0x10000) {
5590 *p++ = '\\';
5591 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005592 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5593 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5594 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5595 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5596 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5597 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5598 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5599 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005600 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005601 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005602#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005603 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5604 else if (ch >= 0xD800 && ch < 0xDC00) {
5605 Py_UNICODE ch2;
5606 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005607
Benjamin Peterson29060642009-01-31 22:14:21 +00005608 ch2 = *s++;
5609 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005610 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005611 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5612 *p++ = '\\';
5613 *p++ = 'U';
5614 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5615 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5616 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5617 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5618 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5619 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5620 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5621 *p++ = hexdigits[ucs & 0x0000000F];
5622 continue;
5623 }
5624 /* Fall through: isolated surrogates are copied as-is */
5625 s--;
5626 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005627 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005628#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005629
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005631 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 *p++ = '\\';
5633 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005634 *p++ = hexdigits[(ch >> 12) & 0x000F];
5635 *p++ = hexdigits[(ch >> 8) & 0x000F];
5636 *p++ = hexdigits[(ch >> 4) & 0x000F];
5637 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005639
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005640 /* Map special whitespace to '\t', \n', '\r' */
5641 else if (ch == '\t') {
5642 *p++ = '\\';
5643 *p++ = 't';
5644 }
5645 else if (ch == '\n') {
5646 *p++ = '\\';
5647 *p++ = 'n';
5648 }
5649 else if (ch == '\r') {
5650 *p++ = '\\';
5651 *p++ = 'r';
5652 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005653
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005654 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005655 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005656 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005657 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005658 *p++ = hexdigits[(ch >> 4) & 0x000F];
5659 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005660 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005661
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 /* Copy everything else as-is */
5663 else
5664 *p++ = (char) ch;
5665 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005667 assert(p - PyBytes_AS_STRING(repr) > 0);
5668 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5669 return NULL;
5670 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671}
5672
Alexander Belopolsky40018472011-02-26 01:02:56 +00005673PyObject *
5674PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005676 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 if (!PyUnicode_Check(unicode)) {
5678 PyErr_BadArgument();
5679 return NULL;
5680 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005681 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5682 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005683 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684}
5685
5686/* --- Raw Unicode Escape Codec ------------------------------------------- */
5687
Alexander Belopolsky40018472011-02-26 01:02:56 +00005688PyObject *
5689PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005690 Py_ssize_t size,
5691 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005693 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005694 Py_ssize_t startinpos;
5695 Py_ssize_t endinpos;
5696 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005698 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 const char *end;
5700 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005701 PyObject *errorHandler = NULL;
5702 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005703
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 /* Escaped strings will always be longer than the resulting
5705 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005706 length after conversion to the true value. (But decoding error
5707 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 v = _PyUnicode_New(size);
5709 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005710 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005713 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 end = s + size;
5715 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 unsigned char c;
5717 Py_UCS4 x;
5718 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005719 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720
Benjamin Peterson29060642009-01-31 22:14:21 +00005721 /* Non-escape characters are interpreted as Unicode ordinals */
5722 if (*s != '\\') {
5723 *p++ = (unsigned char)*s++;
5724 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005725 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 startinpos = s-starts;
5727
5728 /* \u-escapes are only interpreted iff the number of leading
5729 backslashes if odd */
5730 bs = s;
5731 for (;s < end;) {
5732 if (*s != '\\')
5733 break;
5734 *p++ = (unsigned char)*s++;
5735 }
5736 if (((s - bs) & 1) == 0 ||
5737 s >= end ||
5738 (*s != 'u' && *s != 'U')) {
5739 continue;
5740 }
5741 p--;
5742 count = *s=='u' ? 4 : 8;
5743 s++;
5744
5745 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5746 outpos = p-PyUnicode_AS_UNICODE(v);
5747 for (x = 0, i = 0; i < count; ++i, ++s) {
5748 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005749 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005750 endinpos = s-starts;
5751 if (unicode_decode_call_errorhandler(
5752 errors, &errorHandler,
5753 "rawunicodeescape", "truncated \\uXXXX",
5754 &starts, &end, &startinpos, &endinpos, &exc, &s,
5755 &v, &outpos, &p))
5756 goto onError;
5757 goto nextByte;
5758 }
5759 x = (x<<4) & ~0xF;
5760 if (c >= '0' && c <= '9')
5761 x += c - '0';
5762 else if (c >= 'a' && c <= 'f')
5763 x += 10 + c - 'a';
5764 else
5765 x += 10 + c - 'A';
5766 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005767 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 /* UCS-2 character */
5769 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005770 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005771 /* UCS-4 character. Either store directly, or as
5772 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005773#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005775#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005776 x -= 0x10000L;
5777 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5778 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005779#endif
5780 } else {
5781 endinpos = s-starts;
5782 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005783 if (unicode_decode_call_errorhandler(
5784 errors, &errorHandler,
5785 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 &starts, &end, &startinpos, &endinpos, &exc, &s,
5787 &v, &outpos, &p))
5788 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005789 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005790 nextByte:
5791 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005793 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005794 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005795 Py_XDECREF(errorHandler);
5796 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005797 if (PyUnicode_READY(v) == -1) {
5798 Py_DECREF(v);
5799 return NULL;
5800 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005802
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005805 Py_XDECREF(errorHandler);
5806 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807 return NULL;
5808}
5809
Alexander Belopolsky40018472011-02-26 01:02:56 +00005810PyObject *
5811PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005812 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005814 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815 char *p;
5816 char *q;
5817
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005818#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005819 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005820#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005821 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005822#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005823
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005824 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005825 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005826
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005827 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828 if (repr == NULL)
5829 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005830 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005831 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005833 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834 while (size-- > 0) {
5835 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005836#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005837 /* Map 32-bit characters to '\Uxxxxxxxx' */
5838 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005839 *p++ = '\\';
5840 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005841 *p++ = hexdigits[(ch >> 28) & 0xf];
5842 *p++ = hexdigits[(ch >> 24) & 0xf];
5843 *p++ = hexdigits[(ch >> 20) & 0xf];
5844 *p++ = hexdigits[(ch >> 16) & 0xf];
5845 *p++ = hexdigits[(ch >> 12) & 0xf];
5846 *p++ = hexdigits[(ch >> 8) & 0xf];
5847 *p++ = hexdigits[(ch >> 4) & 0xf];
5848 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005849 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005850 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005851#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005852 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5853 if (ch >= 0xD800 && ch < 0xDC00) {
5854 Py_UNICODE ch2;
5855 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005856
Benjamin Peterson29060642009-01-31 22:14:21 +00005857 ch2 = *s++;
5858 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005859 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005860 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5861 *p++ = '\\';
5862 *p++ = 'U';
5863 *p++ = hexdigits[(ucs >> 28) & 0xf];
5864 *p++ = hexdigits[(ucs >> 24) & 0xf];
5865 *p++ = hexdigits[(ucs >> 20) & 0xf];
5866 *p++ = hexdigits[(ucs >> 16) & 0xf];
5867 *p++ = hexdigits[(ucs >> 12) & 0xf];
5868 *p++ = hexdigits[(ucs >> 8) & 0xf];
5869 *p++ = hexdigits[(ucs >> 4) & 0xf];
5870 *p++ = hexdigits[ucs & 0xf];
5871 continue;
5872 }
5873 /* Fall through: isolated surrogates are copied as-is */
5874 s--;
5875 size++;
5876 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005877#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 /* Map 16-bit characters to '\uxxxx' */
5879 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880 *p++ = '\\';
5881 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005882 *p++ = hexdigits[(ch >> 12) & 0xf];
5883 *p++ = hexdigits[(ch >> 8) & 0xf];
5884 *p++ = hexdigits[(ch >> 4) & 0xf];
5885 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005887 /* Copy everything else as-is */
5888 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889 *p++ = (char) ch;
5890 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005891 size = p - q;
5892
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005893 assert(size > 0);
5894 if (_PyBytes_Resize(&repr, size) < 0)
5895 return NULL;
5896 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897}
5898
Alexander Belopolsky40018472011-02-26 01:02:56 +00005899PyObject *
5900PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005902 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005904 PyErr_BadArgument();
5905 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005907 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5908 PyUnicode_GET_SIZE(unicode));
5909
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005910 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911}
5912
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005913/* --- Unicode Internal Codec ------------------------------------------- */
5914
Alexander Belopolsky40018472011-02-26 01:02:56 +00005915PyObject *
5916_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005917 Py_ssize_t size,
5918 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005919{
5920 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005921 Py_ssize_t startinpos;
5922 Py_ssize_t endinpos;
5923 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005924 PyUnicodeObject *v;
5925 Py_UNICODE *p;
5926 const char *end;
5927 const char *reason;
5928 PyObject *errorHandler = NULL;
5929 PyObject *exc = NULL;
5930
Neal Norwitzd43069c2006-01-08 01:12:10 +00005931#ifdef Py_UNICODE_WIDE
5932 Py_UNICODE unimax = PyUnicode_GetMax();
5933#endif
5934
Thomas Wouters89f507f2006-12-13 04:49:30 +00005935 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005936 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5937 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005938 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005939 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5940 as string was created with the old API. */
5941 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005942 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005943 p = PyUnicode_AS_UNICODE(v);
5944 end = s + size;
5945
5946 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005947 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005948 /* We have to sanity check the raw data, otherwise doom looms for
5949 some malformed UCS-4 data. */
5950 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005951#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005952 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005953#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005954 end-s < Py_UNICODE_SIZE
5955 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005956 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005957 startinpos = s - starts;
5958 if (end-s < Py_UNICODE_SIZE) {
5959 endinpos = end-starts;
5960 reason = "truncated input";
5961 }
5962 else {
5963 endinpos = s - starts + Py_UNICODE_SIZE;
5964 reason = "illegal code point (> 0x10FFFF)";
5965 }
5966 outpos = p - PyUnicode_AS_UNICODE(v);
5967 if (unicode_decode_call_errorhandler(
5968 errors, &errorHandler,
5969 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005970 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005971 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005972 goto onError;
5973 }
5974 }
5975 else {
5976 p++;
5977 s += Py_UNICODE_SIZE;
5978 }
5979 }
5980
Victor Stinnerfe226c02011-10-03 03:52:20 +02005981 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005982 goto onError;
5983 Py_XDECREF(errorHandler);
5984 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005985 if (PyUnicode_READY(v) == -1) {
5986 Py_DECREF(v);
5987 return NULL;
5988 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005989 return (PyObject *)v;
5990
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005992 Py_XDECREF(v);
5993 Py_XDECREF(errorHandler);
5994 Py_XDECREF(exc);
5995 return NULL;
5996}
5997
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998/* --- Latin-1 Codec ------------------------------------------------------ */
5999
Alexander Belopolsky40018472011-02-26 01:02:56 +00006000PyObject *
6001PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006002 Py_ssize_t size,
6003 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006006 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007}
6008
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006009/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006010static void
6011make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006012 const char *encoding,
6013 const Py_UNICODE *unicode, Py_ssize_t size,
6014 Py_ssize_t startpos, Py_ssize_t endpos,
6015 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006017 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006018 *exceptionObject = PyUnicodeEncodeError_Create(
6019 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 }
6021 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6023 goto onError;
6024 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6025 goto onError;
6026 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6027 goto onError;
6028 return;
6029 onError:
6030 Py_DECREF(*exceptionObject);
6031 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 }
6033}
6034
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006035/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006036static void
6037raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006038 const char *encoding,
6039 const Py_UNICODE *unicode, Py_ssize_t size,
6040 Py_ssize_t startpos, Py_ssize_t endpos,
6041 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006042{
6043 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006044 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006045 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006047}
6048
6049/* error handling callback helper:
6050 build arguments, call the callback and check the arguments,
6051 put the result into newpos and return the replacement string, which
6052 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006053static PyObject *
6054unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006055 PyObject **errorHandler,
6056 const char *encoding, const char *reason,
6057 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6058 Py_ssize_t startpos, Py_ssize_t endpos,
6059 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006060{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006061 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006062
6063 PyObject *restuple;
6064 PyObject *resunicode;
6065
6066 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006068 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006070 }
6071
6072 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006074 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006076
6077 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006079 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006081 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006082 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006083 Py_DECREF(restuple);
6084 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006085 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006086 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006087 &resunicode, newpos)) {
6088 Py_DECREF(restuple);
6089 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006090 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006091 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6092 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6093 Py_DECREF(restuple);
6094 return NULL;
6095 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006096 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006097 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006098 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6100 Py_DECREF(restuple);
6101 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006102 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006103 Py_INCREF(resunicode);
6104 Py_DECREF(restuple);
6105 return resunicode;
6106}
6107
Alexander Belopolsky40018472011-02-26 01:02:56 +00006108static PyObject *
6109unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006110 Py_ssize_t size,
6111 const char *errors,
6112 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006113{
6114 /* output object */
6115 PyObject *res;
6116 /* pointers to the beginning and end+1 of input */
6117 const Py_UNICODE *startp = p;
6118 const Py_UNICODE *endp = p + size;
6119 /* pointer to the beginning of the unencodable characters */
6120 /* const Py_UNICODE *badp = NULL; */
6121 /* pointer into the output */
6122 char *str;
6123 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006124 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006125 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6126 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006127 PyObject *errorHandler = NULL;
6128 PyObject *exc = NULL;
6129 /* the following variable is used for caching string comparisons
6130 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6131 int known_errorHandler = -1;
6132
6133 /* allocate enough for a simple encoding without
6134 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006135 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006136 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006137 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006138 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006139 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006140 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006141 ressize = size;
6142
6143 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006145
Benjamin Peterson29060642009-01-31 22:14:21 +00006146 /* can we encode this? */
6147 if (c<limit) {
6148 /* no overflow check, because we know that the space is enough */
6149 *str++ = (char)c;
6150 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006151 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006152 else {
6153 Py_ssize_t unicodepos = p-startp;
6154 Py_ssize_t requiredsize;
6155 PyObject *repunicode;
6156 Py_ssize_t repsize;
6157 Py_ssize_t newpos;
6158 Py_ssize_t respos;
6159 Py_UNICODE *uni2;
6160 /* startpos for collecting unencodable chars */
6161 const Py_UNICODE *collstart = p;
6162 const Py_UNICODE *collend = p;
6163 /* find all unecodable characters */
6164 while ((collend < endp) && ((*collend)>=limit))
6165 ++collend;
6166 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6167 if (known_errorHandler==-1) {
6168 if ((errors==NULL) || (!strcmp(errors, "strict")))
6169 known_errorHandler = 1;
6170 else if (!strcmp(errors, "replace"))
6171 known_errorHandler = 2;
6172 else if (!strcmp(errors, "ignore"))
6173 known_errorHandler = 3;
6174 else if (!strcmp(errors, "xmlcharrefreplace"))
6175 known_errorHandler = 4;
6176 else
6177 known_errorHandler = 0;
6178 }
6179 switch (known_errorHandler) {
6180 case 1: /* strict */
6181 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6182 goto onError;
6183 case 2: /* replace */
6184 while (collstart++<collend)
6185 *str++ = '?'; /* fall through */
6186 case 3: /* ignore */
6187 p = collend;
6188 break;
6189 case 4: /* xmlcharrefreplace */
6190 respos = str - PyBytes_AS_STRING(res);
6191 /* determine replacement size (temporarily (mis)uses p) */
6192 for (p = collstart, repsize = 0; p < collend; ++p) {
6193 if (*p<10)
6194 repsize += 2+1+1;
6195 else if (*p<100)
6196 repsize += 2+2+1;
6197 else if (*p<1000)
6198 repsize += 2+3+1;
6199 else if (*p<10000)
6200 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006201#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006202 else
6203 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006204#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 else if (*p<100000)
6206 repsize += 2+5+1;
6207 else if (*p<1000000)
6208 repsize += 2+6+1;
6209 else
6210 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006211#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 }
6213 requiredsize = respos+repsize+(endp-collend);
6214 if (requiredsize > ressize) {
6215 if (requiredsize<2*ressize)
6216 requiredsize = 2*ressize;
6217 if (_PyBytes_Resize(&res, requiredsize))
6218 goto onError;
6219 str = PyBytes_AS_STRING(res) + respos;
6220 ressize = requiredsize;
6221 }
6222 /* generate replacement (temporarily (mis)uses p) */
6223 for (p = collstart; p < collend; ++p) {
6224 str += sprintf(str, "&#%d;", (int)*p);
6225 }
6226 p = collend;
6227 break;
6228 default:
6229 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6230 encoding, reason, startp, size, &exc,
6231 collstart-startp, collend-startp, &newpos);
6232 if (repunicode == NULL)
6233 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006234 if (PyBytes_Check(repunicode)) {
6235 /* Directly copy bytes result to output. */
6236 repsize = PyBytes_Size(repunicode);
6237 if (repsize > 1) {
6238 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006239 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006240 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6241 Py_DECREF(repunicode);
6242 goto onError;
6243 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006244 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006245 ressize += repsize-1;
6246 }
6247 memcpy(str, PyBytes_AsString(repunicode), repsize);
6248 str += repsize;
6249 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006250 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006251 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006252 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 /* need more space? (at least enough for what we
6254 have+the replacement+the rest of the string, so
6255 we won't have to check space for encodable characters) */
6256 respos = str - PyBytes_AS_STRING(res);
6257 repsize = PyUnicode_GET_SIZE(repunicode);
6258 requiredsize = respos+repsize+(endp-collend);
6259 if (requiredsize > ressize) {
6260 if (requiredsize<2*ressize)
6261 requiredsize = 2*ressize;
6262 if (_PyBytes_Resize(&res, requiredsize)) {
6263 Py_DECREF(repunicode);
6264 goto onError;
6265 }
6266 str = PyBytes_AS_STRING(res) + respos;
6267 ressize = requiredsize;
6268 }
6269 /* check if there is anything unencodable in the replacement
6270 and copy it to the output */
6271 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6272 c = *uni2;
6273 if (c >= limit) {
6274 raise_encode_exception(&exc, encoding, startp, size,
6275 unicodepos, unicodepos+1, reason);
6276 Py_DECREF(repunicode);
6277 goto onError;
6278 }
6279 *str = (char)c;
6280 }
6281 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006282 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006283 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006284 }
6285 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006286 /* Resize if we allocated to much */
6287 size = str - PyBytes_AS_STRING(res);
6288 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006289 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006290 if (_PyBytes_Resize(&res, size) < 0)
6291 goto onError;
6292 }
6293
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006294 Py_XDECREF(errorHandler);
6295 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006296 return res;
6297
6298 onError:
6299 Py_XDECREF(res);
6300 Py_XDECREF(errorHandler);
6301 Py_XDECREF(exc);
6302 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006303}
6304
Alexander Belopolsky40018472011-02-26 01:02:56 +00006305PyObject *
6306PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006307 Py_ssize_t size,
6308 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006310 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311}
6312
Alexander Belopolsky40018472011-02-26 01:02:56 +00006313PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006314_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315{
6316 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006317 PyErr_BadArgument();
6318 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006320 if (PyUnicode_READY(unicode) == -1)
6321 return NULL;
6322 /* Fast path: if it is a one-byte string, construct
6323 bytes object directly. */
6324 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6325 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6326 PyUnicode_GET_LENGTH(unicode));
6327 /* Non-Latin-1 characters present. Defer to above function to
6328 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006330 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006331 errors);
6332}
6333
6334PyObject*
6335PyUnicode_AsLatin1String(PyObject *unicode)
6336{
6337 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006338}
6339
6340/* --- 7-bit ASCII Codec -------------------------------------------------- */
6341
Alexander Belopolsky40018472011-02-26 01:02:56 +00006342PyObject *
6343PyUnicode_DecodeASCII(const char *s,
6344 Py_ssize_t size,
6345 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006347 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348 PyUnicodeObject *v;
6349 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006350 Py_ssize_t startinpos;
6351 Py_ssize_t endinpos;
6352 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006353 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006354 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006355 PyObject *errorHandler = NULL;
6356 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006357 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006358
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006360 if (size == 1 && *(unsigned char*)s < 128)
6361 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6362
6363 /* Fast path. Assume the input actually *is* ASCII, and allocate
6364 a single-block Unicode object with that assumption. If there is
6365 an error, drop the object and start over. */
6366 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6367 if (v == NULL)
6368 goto onError;
6369 d = PyUnicode_1BYTE_DATA(v);
6370 for (i = 0; i < size; i++) {
6371 unsigned char ch = ((unsigned char*)s)[i];
6372 if (ch < 128)
6373 d[i] = ch;
6374 else
6375 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006376 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006377 if (i == size)
6378 return (PyObject*)v;
6379 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006380
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381 v = _PyUnicode_New(size);
6382 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006385 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006387 e = s + size;
6388 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006389 register unsigned char c = (unsigned char)*s;
6390 if (c < 128) {
6391 *p++ = c;
6392 ++s;
6393 }
6394 else {
6395 startinpos = s-starts;
6396 endinpos = startinpos + 1;
6397 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6398 if (unicode_decode_call_errorhandler(
6399 errors, &errorHandler,
6400 "ascii", "ordinal not in range(128)",
6401 &starts, &e, &startinpos, &endinpos, &exc, &s,
6402 &v, &outpos, &p))
6403 goto onError;
6404 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006405 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006406 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006407 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006408 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006409 Py_XDECREF(errorHandler);
6410 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006411 if (PyUnicode_READY(v) == -1) {
6412 Py_DECREF(v);
6413 return NULL;
6414 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006416
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 Py_XDECREF(errorHandler);
6420 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421 return NULL;
6422}
6423
Alexander Belopolsky40018472011-02-26 01:02:56 +00006424PyObject *
6425PyUnicode_EncodeASCII(const Py_UNICODE *p,
6426 Py_ssize_t size,
6427 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006429 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430}
6431
Alexander Belopolsky40018472011-02-26 01:02:56 +00006432PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006433_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434{
6435 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006436 PyErr_BadArgument();
6437 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006439 if (PyUnicode_READY(unicode) == -1)
6440 return NULL;
6441 /* Fast path: if it is an ASCII-only string, construct bytes object
6442 directly. Else defer to above function to raise the exception. */
6443 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6444 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6445 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006448 errors);
6449}
6450
6451PyObject *
6452PyUnicode_AsASCIIString(PyObject *unicode)
6453{
6454 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455}
6456
Victor Stinner99b95382011-07-04 14:23:54 +02006457#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006458
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006459/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006460
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006461#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006462#define NEED_RETRY
6463#endif
6464
6465/* XXX This code is limited to "true" double-byte encodings, as
6466 a) it assumes an incomplete character consists of a single byte, and
6467 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006468 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006469
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte, else 0.

   A byte that IsDBCSLeadByte() accepts only counts when CharPrev() stays
   on it, when the previous character does not start with a lead byte, or
   when the previous character is two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6482/*
6483 * Decode MBCS string into unicode object. If 'final' is set, converts
6484 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6485 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006486static int
6487decode_mbcs(PyUnicodeObject **v,
6488 const char *s, /* MBCS string */
6489 int size, /* sizeof MBCS string */
6490 int final,
6491 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006492{
6493 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006494 Py_ssize_t n;
6495 DWORD usize;
6496 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006497
6498 assert(size >= 0);
6499
Victor Stinner554f3f02010-06-16 23:33:54 +00006500 /* check and handle 'errors' arg */
6501 if (errors==NULL || strcmp(errors, "strict")==0)
6502 flags = MB_ERR_INVALID_CHARS;
6503 else if (strcmp(errors, "ignore")==0)
6504 flags = 0;
6505 else {
6506 PyErr_Format(PyExc_ValueError,
6507 "mbcs encoding does not support errors='%s'",
6508 errors);
6509 return -1;
6510 }
6511
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006512 /* Skip trailing lead-byte unless 'final' is set */
6513 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006515
6516 /* First get the size of the result */
6517 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006518 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6519 if (usize==0)
6520 goto mbcs_decode_error;
6521 } else
6522 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006523
6524 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 /* Create unicode object */
6526 *v = _PyUnicode_New(usize);
6527 if (*v == NULL)
6528 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006529 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006530 }
6531 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 /* Extend unicode object */
6533 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006534 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006536 }
6537
6538 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006539 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006541 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6542 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006543 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006544 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006545 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006546
6547mbcs_decode_error:
6548 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6549 we raise a UnicodeDecodeError - else it is a 'generic'
6550 windows error
6551 */
6552 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6553 /* Ideally, we should get reason from FormatMessage - this
6554 is the Windows 2000 English version of the message
6555 */
6556 PyObject *exc = NULL;
6557 const char *reason = "No mapping for the Unicode character exists "
6558 "in the target multi-byte code page.";
6559 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6560 if (exc != NULL) {
6561 PyCodec_StrictErrors(exc);
6562 Py_DECREF(exc);
6563 }
6564 } else {
6565 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6566 }
6567 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006568}
6569
Alexander Belopolsky40018472011-02-26 01:02:56 +00006570PyObject *
6571PyUnicode_DecodeMBCSStateful(const char *s,
6572 Py_ssize_t size,
6573 const char *errors,
6574 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006575{
6576 PyUnicodeObject *v = NULL;
6577 int done;
6578
6579 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006580 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006581
6582#ifdef NEED_RETRY
6583 retry:
6584 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006585 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006586 else
6587#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006588 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006589
6590 if (done < 0) {
6591 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006592 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006593 }
6594
6595 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006596 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006597
6598#ifdef NEED_RETRY
6599 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006600 s += done;
6601 size -= done;
6602 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006603 }
6604#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006605 if (PyUnicode_READY(v) == -1) {
6606 Py_DECREF(v);
6607 return NULL;
6608 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006609 return (PyObject *)v;
6610}
6611
Alexander Belopolsky40018472011-02-26 01:02:56 +00006612PyObject *
6613PyUnicode_DecodeMBCS(const char *s,
6614 Py_ssize_t size,
6615 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006616{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006617 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6618}
6619
6620/*
6621 * Convert unicode into string object (MBCS).
6622 * Returns 0 if succeed, -1 otherwise.
6623 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006624static int
6625encode_mbcs(PyObject **repr,
6626 const Py_UNICODE *p, /* unicode */
6627 int size, /* size of unicode */
6628 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006629{
Victor Stinner554f3f02010-06-16 23:33:54 +00006630 BOOL usedDefaultChar = FALSE;
6631 BOOL *pusedDefaultChar;
6632 int mbcssize;
6633 Py_ssize_t n;
6634 PyObject *exc = NULL;
6635 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006636
6637 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006638
Victor Stinner554f3f02010-06-16 23:33:54 +00006639 /* check and handle 'errors' arg */
6640 if (errors==NULL || strcmp(errors, "strict")==0) {
6641 flags = WC_NO_BEST_FIT_CHARS;
6642 pusedDefaultChar = &usedDefaultChar;
6643 } else if (strcmp(errors, "replace")==0) {
6644 flags = 0;
6645 pusedDefaultChar = NULL;
6646 } else {
6647 PyErr_Format(PyExc_ValueError,
6648 "mbcs encoding does not support errors='%s'",
6649 errors);
6650 return -1;
6651 }
6652
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006653 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006654 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006655 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6656 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006657 if (mbcssize == 0) {
6658 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6659 return -1;
6660 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006661 /* If we used a default char, then we failed! */
6662 if (pusedDefaultChar && *pusedDefaultChar)
6663 goto mbcs_encode_error;
6664 } else {
6665 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006666 }
6667
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006668 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 /* Create string object */
6670 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6671 if (*repr == NULL)
6672 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006673 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006674 }
6675 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006676 /* Extend string object */
6677 n = PyBytes_Size(*repr);
6678 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6679 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006680 }
6681
6682 /* Do the conversion */
6683 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006685 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6686 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006687 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6688 return -1;
6689 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006690 if (pusedDefaultChar && *pusedDefaultChar)
6691 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006692 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006693 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006694
6695mbcs_encode_error:
6696 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6697 Py_XDECREF(exc);
6698 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006699}
6700
Alexander Belopolsky40018472011-02-26 01:02:56 +00006701PyObject *
6702PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6703 Py_ssize_t size,
6704 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006705{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006706 PyObject *repr = NULL;
6707 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006708
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006709#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006710 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006711 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006712 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006713 else
6714#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006715 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006716
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006717 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006718 Py_XDECREF(repr);
6719 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006720 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006721
6722#ifdef NEED_RETRY
6723 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006724 p += INT_MAX;
6725 size -= INT_MAX;
6726 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006727 }
6728#endif
6729
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006730 return repr;
6731}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006732
Alexander Belopolsky40018472011-02-26 01:02:56 +00006733PyObject *
6734PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006735{
6736 if (!PyUnicode_Check(unicode)) {
6737 PyErr_BadArgument();
6738 return NULL;
6739 }
6740 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006741 PyUnicode_GET_SIZE(unicode),
6742 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006743}
6744
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006745#undef NEED_RETRY
6746
Victor Stinner99b95382011-07-04 14:23:54 +02006747#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006748
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749/* --- Character Mapping Codec -------------------------------------------- */
6750
Alexander Belopolsky40018472011-02-26 01:02:56 +00006751PyObject *
6752PyUnicode_DecodeCharmap(const char *s,
6753 Py_ssize_t size,
6754 PyObject *mapping,
6755 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006757 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006758 Py_ssize_t startinpos;
6759 Py_ssize_t endinpos;
6760 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006761 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762 PyUnicodeObject *v;
6763 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006764 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006765 PyObject *errorHandler = NULL;
6766 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006767 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006768 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006769
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770 /* Default to Latin-1 */
6771 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773
6774 v = _PyUnicode_New(size);
6775 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006776 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006780 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006781 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 mapstring = PyUnicode_AS_UNICODE(mapping);
6783 maplen = PyUnicode_GET_SIZE(mapping);
6784 while (s < e) {
6785 unsigned char ch = *s;
6786 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787
Benjamin Peterson29060642009-01-31 22:14:21 +00006788 if (ch < maplen)
6789 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790
Benjamin Peterson29060642009-01-31 22:14:21 +00006791 if (x == 0xfffe) {
6792 /* undefined mapping */
6793 outpos = p-PyUnicode_AS_UNICODE(v);
6794 startinpos = s-starts;
6795 endinpos = startinpos+1;
6796 if (unicode_decode_call_errorhandler(
6797 errors, &errorHandler,
6798 "charmap", "character maps to <undefined>",
6799 &starts, &e, &startinpos, &endinpos, &exc, &s,
6800 &v, &outpos, &p)) {
6801 goto onError;
6802 }
6803 continue;
6804 }
6805 *p++ = x;
6806 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006807 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006808 }
6809 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 while (s < e) {
6811 unsigned char ch = *s;
6812 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006813
Benjamin Peterson29060642009-01-31 22:14:21 +00006814 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6815 w = PyLong_FromLong((long)ch);
6816 if (w == NULL)
6817 goto onError;
6818 x = PyObject_GetItem(mapping, w);
6819 Py_DECREF(w);
6820 if (x == NULL) {
6821 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6822 /* No mapping found means: mapping is undefined. */
6823 PyErr_Clear();
6824 x = Py_None;
6825 Py_INCREF(x);
6826 } else
6827 goto onError;
6828 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006829
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 /* Apply mapping */
6831 if (PyLong_Check(x)) {
6832 long value = PyLong_AS_LONG(x);
6833 if (value < 0 || value > 65535) {
6834 PyErr_SetString(PyExc_TypeError,
6835 "character mapping must be in range(65536)");
6836 Py_DECREF(x);
6837 goto onError;
6838 }
6839 *p++ = (Py_UNICODE)value;
6840 }
6841 else if (x == Py_None) {
6842 /* undefined mapping */
6843 outpos = p-PyUnicode_AS_UNICODE(v);
6844 startinpos = s-starts;
6845 endinpos = startinpos+1;
6846 if (unicode_decode_call_errorhandler(
6847 errors, &errorHandler,
6848 "charmap", "character maps to <undefined>",
6849 &starts, &e, &startinpos, &endinpos, &exc, &s,
6850 &v, &outpos, &p)) {
6851 Py_DECREF(x);
6852 goto onError;
6853 }
6854 Py_DECREF(x);
6855 continue;
6856 }
6857 else if (PyUnicode_Check(x)) {
6858 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006859
Benjamin Peterson29060642009-01-31 22:14:21 +00006860 if (targetsize == 1)
6861 /* 1-1 mapping */
6862 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006863
Benjamin Peterson29060642009-01-31 22:14:21 +00006864 else if (targetsize > 1) {
6865 /* 1-n mapping */
6866 if (targetsize > extrachars) {
6867 /* resize first */
6868 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6869 Py_ssize_t needed = (targetsize - extrachars) + \
6870 (targetsize << 2);
6871 extrachars += needed;
6872 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02006873 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00006874 PyUnicode_GET_SIZE(v) + needed) < 0) {
6875 Py_DECREF(x);
6876 goto onError;
6877 }
6878 p = PyUnicode_AS_UNICODE(v) + oldpos;
6879 }
6880 Py_UNICODE_COPY(p,
6881 PyUnicode_AS_UNICODE(x),
6882 targetsize);
6883 p += targetsize;
6884 extrachars -= targetsize;
6885 }
6886 /* 1-0 mapping: skip the character */
6887 }
6888 else {
6889 /* wrong return value */
6890 PyErr_SetString(PyExc_TypeError,
6891 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006892 Py_DECREF(x);
6893 goto onError;
6894 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006895 Py_DECREF(x);
6896 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006897 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 }
6899 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006900 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006901 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006902 Py_XDECREF(errorHandler);
6903 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006904 if (PyUnicode_READY(v) == -1) {
6905 Py_DECREF(v);
6906 return NULL;
6907 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006909
Benjamin Peterson29060642009-01-31 22:14:21 +00006910 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006911 Py_XDECREF(errorHandler);
6912 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913 Py_XDECREF(v);
6914 return NULL;
6915}
6916
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006917/* Charmap encoding: the lookup table */
6918
Alexander Belopolsky40018472011-02-26 01:02:56 +00006919struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 PyObject_HEAD
6921 unsigned char level1[32];
6922 int count2, count3;
6923 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006924};
6925
6926static PyObject*
6927encoding_map_size(PyObject *obj, PyObject* args)
6928{
6929 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006930 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006931 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006932}
6933
6934static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006935 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 PyDoc_STR("Return the size (in bytes) of this object") },
6937 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006938};
6939
6940static void
6941encoding_map_dealloc(PyObject* o)
6942{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006943 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006944}
6945
6946static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006947 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006948 "EncodingMap", /*tp_name*/
6949 sizeof(struct encoding_map), /*tp_basicsize*/
6950 0, /*tp_itemsize*/
6951 /* methods */
6952 encoding_map_dealloc, /*tp_dealloc*/
6953 0, /*tp_print*/
6954 0, /*tp_getattr*/
6955 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006956 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 0, /*tp_repr*/
6958 0, /*tp_as_number*/
6959 0, /*tp_as_sequence*/
6960 0, /*tp_as_mapping*/
6961 0, /*tp_hash*/
6962 0, /*tp_call*/
6963 0, /*tp_str*/
6964 0, /*tp_getattro*/
6965 0, /*tp_setattro*/
6966 0, /*tp_as_buffer*/
6967 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6968 0, /*tp_doc*/
6969 0, /*tp_traverse*/
6970 0, /*tp_clear*/
6971 0, /*tp_richcompare*/
6972 0, /*tp_weaklistoffset*/
6973 0, /*tp_iter*/
6974 0, /*tp_iternext*/
6975 encoding_map_methods, /*tp_methods*/
6976 0, /*tp_members*/
6977 0, /*tp_getset*/
6978 0, /*tp_base*/
6979 0, /*tp_dict*/
6980 0, /*tp_descr_get*/
6981 0, /*tp_descr_set*/
6982 0, /*tp_dictoffset*/
6983 0, /*tp_init*/
6984 0, /*tp_alloc*/
6985 0, /*tp_new*/
6986 0, /*tp_free*/
6987 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006988};
6989
6990PyObject*
6991PyUnicode_BuildEncodingMap(PyObject* string)
6992{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006993 PyObject *result;
6994 struct encoding_map *mresult;
6995 int i;
6996 int need_dict = 0;
6997 unsigned char level1[32];
6998 unsigned char level2[512];
6999 unsigned char *mlevel1, *mlevel2, *mlevel3;
7000 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007001 int kind;
7002 void *data;
7003 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007005 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007006 PyErr_BadArgument();
7007 return NULL;
7008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007009 kind = PyUnicode_KIND(string);
7010 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007011 memset(level1, 0xFF, sizeof level1);
7012 memset(level2, 0xFF, sizeof level2);
7013
7014 /* If there isn't a one-to-one mapping of NULL to \0,
7015 or if there are non-BMP characters, we need to use
7016 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007017 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007018 need_dict = 1;
7019 for (i = 1; i < 256; i++) {
7020 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007021 ch = PyUnicode_READ(kind, data, i);
7022 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007023 need_dict = 1;
7024 break;
7025 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007026 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007027 /* unmapped character */
7028 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007029 l1 = ch >> 11;
7030 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007031 if (level1[l1] == 0xFF)
7032 level1[l1] = count2++;
7033 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007034 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007035 }
7036
7037 if (count2 >= 0xFF || count3 >= 0xFF)
7038 need_dict = 1;
7039
7040 if (need_dict) {
7041 PyObject *result = PyDict_New();
7042 PyObject *key, *value;
7043 if (!result)
7044 return NULL;
7045 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007046 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007047 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007048 if (!key || !value)
7049 goto failed1;
7050 if (PyDict_SetItem(result, key, value) == -1)
7051 goto failed1;
7052 Py_DECREF(key);
7053 Py_DECREF(value);
7054 }
7055 return result;
7056 failed1:
7057 Py_XDECREF(key);
7058 Py_XDECREF(value);
7059 Py_DECREF(result);
7060 return NULL;
7061 }
7062
7063 /* Create a three-level trie */
7064 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7065 16*count2 + 128*count3 - 1);
7066 if (!result)
7067 return PyErr_NoMemory();
7068 PyObject_Init(result, &EncodingMapType);
7069 mresult = (struct encoding_map*)result;
7070 mresult->count2 = count2;
7071 mresult->count3 = count3;
7072 mlevel1 = mresult->level1;
7073 mlevel2 = mresult->level23;
7074 mlevel3 = mresult->level23 + 16*count2;
7075 memcpy(mlevel1, level1, 32);
7076 memset(mlevel2, 0xFF, 16*count2);
7077 memset(mlevel3, 0, 128*count3);
7078 count3 = 0;
7079 for (i = 1; i < 256; i++) {
7080 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007081 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007082 /* unmapped character */
7083 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007084 o1 = PyUnicode_READ(kind, data, i)>>11;
7085 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007086 i2 = 16*mlevel1[o1] + o2;
7087 if (mlevel2[i2] == 0xFF)
7088 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007089 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007090 i3 = 128*mlevel2[i2] + o3;
7091 mlevel3[i3] = i;
7092 }
7093 return result;
7094}
7095
7096static int
7097encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7098{
7099 struct encoding_map *map = (struct encoding_map*)mapping;
7100 int l1 = c>>11;
7101 int l2 = (c>>7) & 0xF;
7102 int l3 = c & 0x7F;
7103 int i;
7104
7105#ifdef Py_UNICODE_WIDE
7106 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007107 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007108 }
7109#endif
7110 if (c == 0)
7111 return 0;
7112 /* level 1*/
7113 i = map->level1[l1];
7114 if (i == 0xFF) {
7115 return -1;
7116 }
7117 /* level 2*/
7118 i = map->level23[16*i+l2];
7119 if (i == 0xFF) {
7120 return -1;
7121 }
7122 /* level 3 */
7123 i = map->level23[16*map->count2 + 128*i + l3];
7124 if (i == 0) {
7125 return -1;
7126 }
7127 return i;
7128}
7129
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007130/* Lookup the character ch in the mapping. If the character
7131 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007132 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007133static PyObject *
7134charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135{
Christian Heimes217cfd12007-12-02 14:31:20 +00007136 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007137 PyObject *x;
7138
7139 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007140 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007141 x = PyObject_GetItem(mapping, w);
7142 Py_DECREF(w);
7143 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007144 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7145 /* No mapping found means: mapping is undefined. */
7146 PyErr_Clear();
7147 x = Py_None;
7148 Py_INCREF(x);
7149 return x;
7150 } else
7151 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007153 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007154 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007155 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007156 long value = PyLong_AS_LONG(x);
7157 if (value < 0 || value > 255) {
7158 PyErr_SetString(PyExc_TypeError,
7159 "character mapping must be in range(256)");
7160 Py_DECREF(x);
7161 return NULL;
7162 }
7163 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007165 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007166 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007168 /* wrong return value */
7169 PyErr_Format(PyExc_TypeError,
7170 "character mapping must return integer, bytes or None, not %.400s",
7171 x->ob_type->tp_name);
7172 Py_DECREF(x);
7173 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007174 }
7175}
7176
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007177static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007178charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007179{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007180 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7181 /* exponentially overallocate to minimize reallocations */
7182 if (requiredsize < 2*outsize)
7183 requiredsize = 2*outsize;
7184 if (_PyBytes_Resize(outobj, requiredsize))
7185 return -1;
7186 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007187}
7188
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the encoded byte(s) were written to the output */
    enc_FAILED = 1,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION = 2   /* the lookup or an output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007192/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007193 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007194 space is available. Return a new reference to the object that
7195 was put in the output buffer, or Py_None, if the mapping was undefined
7196 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007197 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007198static charmapencode_result
7199charmapencode_output(Py_UNICODE c, PyObject *mapping,
7200 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007201{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007202 PyObject *rep;
7203 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007204 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007205
Christian Heimes90aa7642007-12-19 02:45:37 +00007206 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007207 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007208 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007209 if (res == -1)
7210 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007211 if (outsize<requiredsize)
7212 if (charmapencode_resize(outobj, outpos, requiredsize))
7213 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007214 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007215 outstart[(*outpos)++] = (char)res;
7216 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007217 }
7218
7219 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007220 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007221 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007222 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007223 Py_DECREF(rep);
7224 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007225 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007226 if (PyLong_Check(rep)) {
7227 Py_ssize_t requiredsize = *outpos+1;
7228 if (outsize<requiredsize)
7229 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7230 Py_DECREF(rep);
7231 return enc_EXCEPTION;
7232 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007233 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007234 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007235 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007236 else {
7237 const char *repchars = PyBytes_AS_STRING(rep);
7238 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7239 Py_ssize_t requiredsize = *outpos+repsize;
7240 if (outsize<requiredsize)
7241 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7242 Py_DECREF(rep);
7243 return enc_EXCEPTION;
7244 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007245 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007246 memcpy(outstart + *outpos, repchars, repsize);
7247 *outpos += repsize;
7248 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007249 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007250 Py_DECREF(rep);
7251 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007252}
7253
7254/* handle an error in PyUnicode_EncodeCharmap
7255 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007256static int
7257charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007258 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007259 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007260 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007261 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007262{
7263 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007264 Py_ssize_t repsize;
7265 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007266 Py_UNICODE *uni2;
7267 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007268 Py_ssize_t collstartpos = *inpos;
7269 Py_ssize_t collendpos = *inpos+1;
7270 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007271 char *encoding = "charmap";
7272 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007273 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007274
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007275 /* find all unencodable characters */
7276 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007277 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007278 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007279 int res = encoding_map_lookup(p[collendpos], mapping);
7280 if (res != -1)
7281 break;
7282 ++collendpos;
7283 continue;
7284 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007285
Benjamin Peterson29060642009-01-31 22:14:21 +00007286 rep = charmapencode_lookup(p[collendpos], mapping);
7287 if (rep==NULL)
7288 return -1;
7289 else if (rep!=Py_None) {
7290 Py_DECREF(rep);
7291 break;
7292 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007293 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007294 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007295 }
7296 /* cache callback name lookup
7297 * (if not done yet, i.e. it's the first error) */
7298 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007299 if ((errors==NULL) || (!strcmp(errors, "strict")))
7300 *known_errorHandler = 1;
7301 else if (!strcmp(errors, "replace"))
7302 *known_errorHandler = 2;
7303 else if (!strcmp(errors, "ignore"))
7304 *known_errorHandler = 3;
7305 else if (!strcmp(errors, "xmlcharrefreplace"))
7306 *known_errorHandler = 4;
7307 else
7308 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007309 }
7310 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007311 case 1: /* strict */
7312 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7313 return -1;
7314 case 2: /* replace */
7315 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007316 x = charmapencode_output('?', mapping, res, respos);
7317 if (x==enc_EXCEPTION) {
7318 return -1;
7319 }
7320 else if (x==enc_FAILED) {
7321 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7322 return -1;
7323 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007324 }
7325 /* fall through */
7326 case 3: /* ignore */
7327 *inpos = collendpos;
7328 break;
7329 case 4: /* xmlcharrefreplace */
7330 /* generate replacement (temporarily (mis)uses p) */
7331 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007332 char buffer[2+29+1+1];
7333 char *cp;
7334 sprintf(buffer, "&#%d;", (int)p[collpos]);
7335 for (cp = buffer; *cp; ++cp) {
7336 x = charmapencode_output(*cp, mapping, res, respos);
7337 if (x==enc_EXCEPTION)
7338 return -1;
7339 else if (x==enc_FAILED) {
7340 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7341 return -1;
7342 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007343 }
7344 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007345 *inpos = collendpos;
7346 break;
7347 default:
7348 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 encoding, reason, p, size, exceptionObject,
7350 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007351 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007352 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007353 if (PyBytes_Check(repunicode)) {
7354 /* Directly copy bytes result to output. */
7355 Py_ssize_t outsize = PyBytes_Size(*res);
7356 Py_ssize_t requiredsize;
7357 repsize = PyBytes_Size(repunicode);
7358 requiredsize = *respos + repsize;
7359 if (requiredsize > outsize)
7360 /* Make room for all additional bytes. */
7361 if (charmapencode_resize(res, respos, requiredsize)) {
7362 Py_DECREF(repunicode);
7363 return -1;
7364 }
7365 memcpy(PyBytes_AsString(*res) + *respos,
7366 PyBytes_AsString(repunicode), repsize);
7367 *respos += repsize;
7368 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007369 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007370 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007371 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007372 /* generate replacement */
7373 repsize = PyUnicode_GET_SIZE(repunicode);
7374 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 x = charmapencode_output(*uni2, mapping, res, respos);
7376 if (x==enc_EXCEPTION) {
7377 return -1;
7378 }
7379 else if (x==enc_FAILED) {
7380 Py_DECREF(repunicode);
7381 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7382 return -1;
7383 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007384 }
7385 *inpos = newpos;
7386 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007387 }
7388 return 0;
7389}
7390
Alexander Belopolsky40018472011-02-26 01:02:56 +00007391PyObject *
7392PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7393 Py_ssize_t size,
7394 PyObject *mapping,
7395 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007397 /* output object */
7398 PyObject *res = NULL;
7399 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007400 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007401 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007402 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007403 PyObject *errorHandler = NULL;
7404 PyObject *exc = NULL;
7405 /* the following variable is used for caching string comparisons
7406 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7407 * 3=ignore, 4=xmlcharrefreplace */
7408 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409
7410 /* Default to Latin-1 */
7411 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007412 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007414 /* allocate enough for a simple encoding without
7415 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007416 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007417 if (res == NULL)
7418 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007419 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007420 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007422 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007423 /* try to encode it */
7424 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7425 if (x==enc_EXCEPTION) /* error */
7426 goto onError;
7427 if (x==enc_FAILED) { /* unencodable character */
7428 if (charmap_encoding_error(p, size, &inpos, mapping,
7429 &exc,
7430 &known_errorHandler, &errorHandler, errors,
7431 &res, &respos)) {
7432 goto onError;
7433 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007434 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007435 else
7436 /* done with this character => adjust input position */
7437 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007440 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007441 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007442 if (_PyBytes_Resize(&res, respos) < 0)
7443 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007444
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007445 Py_XDECREF(exc);
7446 Py_XDECREF(errorHandler);
7447 return res;
7448
Benjamin Peterson29060642009-01-31 22:14:21 +00007449 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007450 Py_XDECREF(res);
7451 Py_XDECREF(exc);
7452 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453 return NULL;
7454}
7455
Alexander Belopolsky40018472011-02-26 01:02:56 +00007456PyObject *
7457PyUnicode_AsCharmapString(PyObject *unicode,
7458 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459{
7460 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 PyErr_BadArgument();
7462 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463 }
7464 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 PyUnicode_GET_SIZE(unicode),
7466 mapping,
7467 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468}
7469
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007470/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007471static void
7472make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007473 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007474 Py_ssize_t startpos, Py_ssize_t endpos,
7475 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007477 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007478 *exceptionObject = _PyUnicodeTranslateError_Create(
7479 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480 }
7481 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007482 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7483 goto onError;
7484 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7485 goto onError;
7486 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7487 goto onError;
7488 return;
7489 onError:
7490 Py_DECREF(*exceptionObject);
7491 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492 }
7493}
7494
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007495/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007496static void
7497raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007498 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007499 Py_ssize_t startpos, Py_ssize_t endpos,
7500 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007501{
7502 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007503 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007504 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007505 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007506}
7507
7508/* error handling callback helper:
7509 build arguments, call the callback and check the arguments,
7510 put the result into newpos and return the replacement string, which
7511 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007512static PyObject *
7513unicode_translate_call_errorhandler(const char *errors,
7514 PyObject **errorHandler,
7515 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007516 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007517 Py_ssize_t startpos, Py_ssize_t endpos,
7518 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007519{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007520 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007521
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007522 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007523 PyObject *restuple;
7524 PyObject *resunicode;
7525
7526 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007527 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007528 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007529 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007530 }
7531
7532 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007533 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007534 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007535 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007536
7537 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007538 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007539 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007540 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007541 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007542 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007543 Py_DECREF(restuple);
7544 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007545 }
7546 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007547 &resunicode, &i_newpos)) {
7548 Py_DECREF(restuple);
7549 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007550 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007551 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007552 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007553 else
7554 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007555 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007556 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7557 Py_DECREF(restuple);
7558 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007559 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007560 Py_INCREF(resunicode);
7561 Py_DECREF(restuple);
7562 return resunicode;
7563}
7564
7565/* Lookup the character ch in the mapping and put the result in result,
7566 which must be decrefed by the caller.
7567 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007568static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007569charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007570{
Christian Heimes217cfd12007-12-02 14:31:20 +00007571 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007572 PyObject *x;
7573
7574 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007575 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007576 x = PyObject_GetItem(mapping, w);
7577 Py_DECREF(w);
7578 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007579 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7580 /* No mapping found means: use 1:1 mapping. */
7581 PyErr_Clear();
7582 *result = NULL;
7583 return 0;
7584 } else
7585 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007586 }
7587 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 *result = x;
7589 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007590 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007591 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007592 long value = PyLong_AS_LONG(x);
7593 long max = PyUnicode_GetMax();
7594 if (value < 0 || value > max) {
7595 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007596 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007597 Py_DECREF(x);
7598 return -1;
7599 }
7600 *result = x;
7601 return 0;
7602 }
7603 else if (PyUnicode_Check(x)) {
7604 *result = x;
7605 return 0;
7606 }
7607 else {
7608 /* wrong return value */
7609 PyErr_SetString(PyExc_TypeError,
7610 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007611 Py_DECREF(x);
7612 return -1;
7613 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007614}
7615/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007616 if not reallocate and adjust various state variables.
7617 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007618static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007619charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007620 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007621{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007622 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007623 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007624 /* exponentially overallocate to minimize reallocations */
7625 if (requiredsize < 2 * oldsize)
7626 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007627 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7628 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007629 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007630 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007631 }
7632 return 0;
7633}
7634/* lookup the character, put the result in the output string and adjust
7635 various state variables. Return a new reference to the object that
7636 was put in the output buffer in *result, or Py_None, if the mapping was
7637 undefined (in which case no character was written).
7638 The called must decref result.
7639 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007640static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007641charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7642 PyObject *mapping, Py_UCS4 **output,
7643 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007644 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007645{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007646 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7647 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007648 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007649 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007650 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007651 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007652 }
7653 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007655 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007656 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007657 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007658 }
7659 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007660 Py_ssize_t repsize;
7661 if (PyUnicode_READY(*res) == -1)
7662 return -1;
7663 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007664 if (repsize==1) {
7665 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007666 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007667 }
7668 else if (repsize!=0) {
7669 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007670 Py_ssize_t requiredsize = *opos +
7671 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007672 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007673 Py_ssize_t i;
7674 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007676 for(i = 0; i < repsize; i++)
7677 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007679 }
7680 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007681 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007682 return 0;
7683}
7684
Alexander Belopolsky40018472011-02-26 01:02:56 +00007685PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007686_PyUnicode_TranslateCharmap(PyObject *input,
7687 PyObject *mapping,
7688 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007690 /* input object */
7691 char *idata;
7692 Py_ssize_t size, i;
7693 int kind;
7694 /* output buffer */
7695 Py_UCS4 *output = NULL;
7696 Py_ssize_t osize;
7697 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007698 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007699 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007700 char *reason = "character maps to <undefined>";
7701 PyObject *errorHandler = NULL;
7702 PyObject *exc = NULL;
7703 /* the following variable is used for caching string comparisons
7704 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7705 * 3=ignore, 4=xmlcharrefreplace */
7706 int known_errorHandler = -1;
7707
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007709 PyErr_BadArgument();
7710 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007713 if (PyUnicode_READY(input) == -1)
7714 return NULL;
7715 idata = (char*)PyUnicode_DATA(input);
7716 kind = PyUnicode_KIND(input);
7717 size = PyUnicode_GET_LENGTH(input);
7718 i = 0;
7719
7720 if (size == 0) {
7721 Py_INCREF(input);
7722 return input;
7723 }
7724
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007725 /* allocate enough for a simple 1:1 translation without
7726 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007727 osize = size;
7728 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7729 opos = 0;
7730 if (output == NULL) {
7731 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007732 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007733 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007735 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007736 /* try to encode it */
7737 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007738 if (charmaptranslate_output(input, i, mapping,
7739 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007740 Py_XDECREF(x);
7741 goto onError;
7742 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007743 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007744 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007745 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007746 else { /* untranslatable character */
7747 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7748 Py_ssize_t repsize;
7749 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007750 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007751 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007752 Py_ssize_t collstart = i;
7753 Py_ssize_t collend = i+1;
7754 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007757 while (collend < size) {
7758 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007759 goto onError;
7760 Py_XDECREF(x);
7761 if (x!=Py_None)
7762 break;
7763 ++collend;
7764 }
7765 /* cache callback name lookup
7766 * (if not done yet, i.e. it's the first error) */
7767 if (known_errorHandler==-1) {
7768 if ((errors==NULL) || (!strcmp(errors, "strict")))
7769 known_errorHandler = 1;
7770 else if (!strcmp(errors, "replace"))
7771 known_errorHandler = 2;
7772 else if (!strcmp(errors, "ignore"))
7773 known_errorHandler = 3;
7774 else if (!strcmp(errors, "xmlcharrefreplace"))
7775 known_errorHandler = 4;
7776 else
7777 known_errorHandler = 0;
7778 }
7779 switch (known_errorHandler) {
7780 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007781 raise_translate_exception(&exc, input, collstart,
7782 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007783 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007784 case 2: /* replace */
7785 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007786 for (coll = collstart; coll<collend; coll++)
7787 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007788 /* fall through */
7789 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007790 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 break;
7792 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007793 /* generate replacement (temporarily (mis)uses i) */
7794 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007795 char buffer[2+29+1+1];
7796 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007797 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7798 if (charmaptranslate_makespace(&output, &osize,
7799 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 goto onError;
7801 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007802 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007803 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007804 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007805 break;
7806 default:
7807 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007808 reason, input, &exc,
7809 collstart, collend, &newpos);
7810 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007811 goto onError;
7812 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007813 repsize = PyUnicode_GET_LENGTH(repunicode);
7814 if (charmaptranslate_makespace(&output, &osize,
7815 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007816 Py_DECREF(repunicode);
7817 goto onError;
7818 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007819 for (uni2 = 0; repsize-->0; ++uni2)
7820 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7821 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007822 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007823 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007824 }
7825 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007826 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7827 if (!res)
7828 goto onError;
7829 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007830 Py_XDECREF(exc);
7831 Py_XDECREF(errorHandler);
7832 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833
Benjamin Peterson29060642009-01-31 22:14:21 +00007834 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007835 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007836 Py_XDECREF(exc);
7837 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 return NULL;
7839}
7840
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007841/* Deprecated. Use PyUnicode_Translate instead. */
7842PyObject *
7843PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7844 Py_ssize_t size,
7845 PyObject *mapping,
7846 const char *errors)
7847{
7848 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7849 if (!unicode)
7850 return NULL;
7851 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7852}
7853
Alexander Belopolsky40018472011-02-26 01:02:56 +00007854PyObject *
7855PyUnicode_Translate(PyObject *str,
7856 PyObject *mapping,
7857 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858{
7859 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007860
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861 str = PyUnicode_FromObject(str);
7862 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007863 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007864 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865 Py_DECREF(str);
7866 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007867
Benjamin Peterson29060642009-01-31 22:14:21 +00007868 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869 Py_XDECREF(str);
7870 return NULL;
7871}
Tim Petersced69f82003-09-16 20:30:58 +00007872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007873static Py_UCS4
7874fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7875{
7876 /* No need to call PyUnicode_READY(self) because this function is only
7877 called as a callback from fixup() which does it already. */
7878 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7879 const int kind = PyUnicode_KIND(self);
7880 void *data = PyUnicode_DATA(self);
7881 Py_UCS4 maxchar = 0, ch, fixed;
7882 Py_ssize_t i;
7883
7884 for (i = 0; i < len; ++i) {
7885 ch = PyUnicode_READ(kind, data, i);
7886 fixed = 0;
7887 if (ch > 127) {
7888 if (Py_UNICODE_ISSPACE(ch))
7889 fixed = ' ';
7890 else {
7891 const int decimal = Py_UNICODE_TODECIMAL(ch);
7892 if (decimal >= 0)
7893 fixed = '0' + decimal;
7894 }
7895 if (fixed != 0) {
7896 if (fixed > maxchar)
7897 maxchar = fixed;
7898 PyUnicode_WRITE(kind, data, i, fixed);
7899 }
7900 else if (ch > maxchar)
7901 maxchar = ch;
7902 }
7903 else if (ch > maxchar)
7904 maxchar = ch;
7905 }
7906
7907 return maxchar;
7908}
7909
7910PyObject *
7911_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7912{
7913 if (!PyUnicode_Check(unicode)) {
7914 PyErr_BadInternalCall();
7915 return NULL;
7916 }
7917 if (PyUnicode_READY(unicode) == -1)
7918 return NULL;
7919 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7920 /* If the string is already ASCII, just return the same string */
7921 Py_INCREF(unicode);
7922 return unicode;
7923 }
7924 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7925}
7926
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007927PyObject *
7928PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7929 Py_ssize_t length)
7930{
7931 PyObject *result;
7932 Py_UNICODE *p; /* write pointer into result */
7933 Py_ssize_t i;
7934 /* Copy to a new string */
7935 result = (PyObject *)_PyUnicode_New(length);
7936 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7937 if (result == NULL)
7938 return result;
7939 p = PyUnicode_AS_UNICODE(result);
7940 /* Iterate over code points */
7941 for (i = 0; i < length; i++) {
7942 Py_UNICODE ch =s[i];
7943 if (ch > 127) {
7944 int decimal = Py_UNICODE_TODECIMAL(ch);
7945 if (decimal >= 0)
7946 p[i] = '0' + decimal;
7947 }
7948 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007949 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7950 Py_DECREF(result);
7951 return NULL;
7952 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007953 return result;
7954}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007955/* --- Decimal Encoder ---------------------------------------------------- */
7956
Alexander Belopolsky40018472011-02-26 01:02:56 +00007957int
7958PyUnicode_EncodeDecimal(Py_UNICODE *s,
7959 Py_ssize_t length,
7960 char *output,
7961 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007962{
7963 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007964 PyObject *errorHandler = NULL;
7965 PyObject *exc = NULL;
7966 const char *encoding = "decimal";
7967 const char *reason = "invalid decimal Unicode string";
7968 /* the following variable is used for caching string comparisons
7969 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7970 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007971
7972 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007973 PyErr_BadArgument();
7974 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007975 }
7976
7977 p = s;
7978 end = s + length;
7979 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 register Py_UNICODE ch = *p;
7981 int decimal;
7982 PyObject *repunicode;
7983 Py_ssize_t repsize;
7984 Py_ssize_t newpos;
7985 Py_UNICODE *uni2;
7986 Py_UNICODE *collstart;
7987 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007988
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007990 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 ++p;
7992 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007993 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007994 decimal = Py_UNICODE_TODECIMAL(ch);
7995 if (decimal >= 0) {
7996 *output++ = '0' + decimal;
7997 ++p;
7998 continue;
7999 }
8000 if (0 < ch && ch < 256) {
8001 *output++ = (char)ch;
8002 ++p;
8003 continue;
8004 }
8005 /* All other characters are considered unencodable */
8006 collstart = p;
8007 collend = p+1;
8008 while (collend < end) {
8009 if ((0 < *collend && *collend < 256) ||
8010 !Py_UNICODE_ISSPACE(*collend) ||
8011 Py_UNICODE_TODECIMAL(*collend))
8012 break;
8013 }
8014 /* cache callback name lookup
8015 * (if not done yet, i.e. it's the first error) */
8016 if (known_errorHandler==-1) {
8017 if ((errors==NULL) || (!strcmp(errors, "strict")))
8018 known_errorHandler = 1;
8019 else if (!strcmp(errors, "replace"))
8020 known_errorHandler = 2;
8021 else if (!strcmp(errors, "ignore"))
8022 known_errorHandler = 3;
8023 else if (!strcmp(errors, "xmlcharrefreplace"))
8024 known_errorHandler = 4;
8025 else
8026 known_errorHandler = 0;
8027 }
8028 switch (known_errorHandler) {
8029 case 1: /* strict */
8030 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8031 goto onError;
8032 case 2: /* replace */
8033 for (p = collstart; p < collend; ++p)
8034 *output++ = '?';
8035 /* fall through */
8036 case 3: /* ignore */
8037 p = collend;
8038 break;
8039 case 4: /* xmlcharrefreplace */
8040 /* generate replacement (temporarily (mis)uses p) */
8041 for (p = collstart; p < collend; ++p)
8042 output += sprintf(output, "&#%d;", (int)*p);
8043 p = collend;
8044 break;
8045 default:
8046 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8047 encoding, reason, s, length, &exc,
8048 collstart-s, collend-s, &newpos);
8049 if (repunicode == NULL)
8050 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008051 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008052 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008053 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8054 Py_DECREF(repunicode);
8055 goto onError;
8056 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 /* generate replacement */
8058 repsize = PyUnicode_GET_SIZE(repunicode);
8059 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8060 Py_UNICODE ch = *uni2;
8061 if (Py_UNICODE_ISSPACE(ch))
8062 *output++ = ' ';
8063 else {
8064 decimal = Py_UNICODE_TODECIMAL(ch);
8065 if (decimal >= 0)
8066 *output++ = '0' + decimal;
8067 else if (0 < ch && ch < 256)
8068 *output++ = (char)ch;
8069 else {
8070 Py_DECREF(repunicode);
8071 raise_encode_exception(&exc, encoding,
8072 s, length, collstart-s, collend-s, reason);
8073 goto onError;
8074 }
8075 }
8076 }
8077 p = s + newpos;
8078 Py_DECREF(repunicode);
8079 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008080 }
8081 /* 0-terminate the output string */
8082 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008083 Py_XDECREF(exc);
8084 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008085 return 0;
8086
Benjamin Peterson29060642009-01-31 22:14:21 +00008087 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008088 Py_XDECREF(exc);
8089 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008090 return -1;
8091}
8092
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093/* --- Helpers ------------------------------------------------------------ */
8094
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008095#include "stringlib/ucs1lib.h"
8096#include "stringlib/fastsearch.h"
8097#include "stringlib/partition.h"
8098#include "stringlib/split.h"
8099#include "stringlib/count.h"
8100#include "stringlib/find.h"
8101#include "stringlib/localeutil.h"
8102#include "stringlib/undef.h"
8103
8104#include "stringlib/ucs2lib.h"
8105#include "stringlib/fastsearch.h"
8106#include "stringlib/partition.h"
8107#include "stringlib/split.h"
8108#include "stringlib/count.h"
8109#include "stringlib/find.h"
8110#include "stringlib/localeutil.h"
8111#include "stringlib/undef.h"
8112
8113#include "stringlib/ucs4lib.h"
8114#include "stringlib/fastsearch.h"
8115#include "stringlib/partition.h"
8116#include "stringlib/split.h"
8117#include "stringlib/count.h"
8118#include "stringlib/find.h"
8119#include "stringlib/localeutil.h"
8120#include "stringlib/undef.h"
8121
8122static Py_ssize_t
8123any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8124 const Py_UCS1*, Py_ssize_t,
8125 Py_ssize_t, Py_ssize_t),
8126 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8127 const Py_UCS2*, Py_ssize_t,
8128 Py_ssize_t, Py_ssize_t),
8129 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8130 const Py_UCS4*, Py_ssize_t,
8131 Py_ssize_t, Py_ssize_t),
8132 PyObject* s1, PyObject* s2,
8133 Py_ssize_t start,
8134 Py_ssize_t end)
8135{
8136 int kind1, kind2, kind;
8137 void *buf1, *buf2;
8138 Py_ssize_t len1, len2, result;
8139
8140 kind1 = PyUnicode_KIND(s1);
8141 kind2 = PyUnicode_KIND(s2);
8142 kind = kind1 > kind2 ? kind1 : kind2;
8143 buf1 = PyUnicode_DATA(s1);
8144 buf2 = PyUnicode_DATA(s2);
8145 if (kind1 != kind)
8146 buf1 = _PyUnicode_AsKind(s1, kind);
8147 if (!buf1)
8148 return -2;
8149 if (kind2 != kind)
8150 buf2 = _PyUnicode_AsKind(s2, kind);
8151 if (!buf2) {
8152 if (kind1 != kind) PyMem_Free(buf1);
8153 return -2;
8154 }
8155 len1 = PyUnicode_GET_LENGTH(s1);
8156 len2 = PyUnicode_GET_LENGTH(s2);
8157
8158 switch(kind) {
8159 case PyUnicode_1BYTE_KIND:
8160 result = ucs1(buf1, len1, buf2, len2, start, end);
8161 break;
8162 case PyUnicode_2BYTE_KIND:
8163 result = ucs2(buf1, len1, buf2, len2, start, end);
8164 break;
8165 case PyUnicode_4BYTE_KIND:
8166 result = ucs4(buf1, len1, buf2, len2, start, end);
8167 break;
8168 default:
8169 assert(0); result = -2;
8170 }
8171
8172 if (kind1 != kind)
8173 PyMem_Free(buf1);
8174 if (kind2 != kind)
8175 PyMem_Free(buf2);
8176
8177 return result;
8178}
8179
8180Py_ssize_t
8181_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8182 Py_ssize_t n_buffer,
8183 void *digits, Py_ssize_t n_digits,
8184 Py_ssize_t min_width,
8185 const char *grouping,
8186 const char *thousands_sep)
8187{
8188 switch(kind) {
8189 case PyUnicode_1BYTE_KIND:
8190 return _PyUnicode_ucs1_InsertThousandsGrouping(
8191 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8192 min_width, grouping, thousands_sep);
8193 case PyUnicode_2BYTE_KIND:
8194 return _PyUnicode_ucs2_InsertThousandsGrouping(
8195 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8196 min_width, grouping, thousands_sep);
8197 case PyUnicode_4BYTE_KIND:
8198 return _PyUnicode_ucs4_InsertThousandsGrouping(
8199 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8200 min_width, grouping, thousands_sep);
8201 }
8202 assert(0);
8203 return -1;
8204}
8205
8206
Eric Smith8c663262007-08-25 02:26:07 +00008207#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008208#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008209
Thomas Wouters477c8d52006-05-27 19:21:47 +00008210#include "stringlib/count.h"
8211#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008212
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len`; a negative `end` or `start` has `len` added to
   it and is then clamped to 0.  `start` is NOT clamped to `len`, so it may
   still exceed `end` (or `len`) afterwards and callers must cope with that.

   `start` and `end` are assigned to, so they must be modifiable lvalues.
   NOTE(review): the expansion is two bare statements (not wrapped in
   do { } while (0)) and every argument is evaluated more than once, so use
   it only as a full statement inside a braced block and pass side-effect-free
   arguments. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008227
Alexander Belopolsky40018472011-02-26 01:02:56 +00008228Py_ssize_t
8229PyUnicode_Count(PyObject *str,
8230 PyObject *substr,
8231 Py_ssize_t start,
8232 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008234 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008235 PyUnicodeObject* str_obj;
8236 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008237 int kind1, kind2, kind;
8238 void *buf1 = NULL, *buf2 = NULL;
8239 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008240
Thomas Wouters477c8d52006-05-27 19:21:47 +00008241 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008242 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008244 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008245 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 Py_DECREF(str_obj);
8247 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248 }
Tim Petersced69f82003-09-16 20:30:58 +00008249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008250 kind1 = PyUnicode_KIND(str_obj);
8251 kind2 = PyUnicode_KIND(sub_obj);
8252 kind = kind1 > kind2 ? kind1 : kind2;
8253 buf1 = PyUnicode_DATA(str_obj);
8254 if (kind1 != kind)
8255 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8256 if (!buf1)
8257 goto onError;
8258 buf2 = PyUnicode_DATA(sub_obj);
8259 if (kind2 != kind)
8260 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8261 if (!buf2)
8262 goto onError;
8263 len1 = PyUnicode_GET_LENGTH(str_obj);
8264 len2 = PyUnicode_GET_LENGTH(sub_obj);
8265
8266 ADJUST_INDICES(start, end, len1);
8267 switch(kind) {
8268 case PyUnicode_1BYTE_KIND:
8269 result = ucs1lib_count(
8270 ((Py_UCS1*)buf1) + start, end - start,
8271 buf2, len2, PY_SSIZE_T_MAX
8272 );
8273 break;
8274 case PyUnicode_2BYTE_KIND:
8275 result = ucs2lib_count(
8276 ((Py_UCS2*)buf1) + start, end - start,
8277 buf2, len2, PY_SSIZE_T_MAX
8278 );
8279 break;
8280 case PyUnicode_4BYTE_KIND:
8281 result = ucs4lib_count(
8282 ((Py_UCS4*)buf1) + start, end - start,
8283 buf2, len2, PY_SSIZE_T_MAX
8284 );
8285 break;
8286 default:
8287 assert(0); result = 0;
8288 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008289
8290 Py_DECREF(sub_obj);
8291 Py_DECREF(str_obj);
8292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008293 if (kind1 != kind)
8294 PyMem_Free(buf1);
8295 if (kind2 != kind)
8296 PyMem_Free(buf2);
8297
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008299 onError:
8300 Py_DECREF(sub_obj);
8301 Py_DECREF(str_obj);
8302 if (kind1 != kind && buf1)
8303 PyMem_Free(buf1);
8304 if (kind2 != kind && buf2)
8305 PyMem_Free(buf2);
8306 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307}
8308
Alexander Belopolsky40018472011-02-26 01:02:56 +00008309Py_ssize_t
8310PyUnicode_Find(PyObject *str,
8311 PyObject *sub,
8312 Py_ssize_t start,
8313 Py_ssize_t end,
8314 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008316 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008317
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008319 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008321 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008322 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008323 Py_DECREF(str);
8324 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325 }
Tim Petersced69f82003-09-16 20:30:58 +00008326
Thomas Wouters477c8d52006-05-27 19:21:47 +00008327 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008328 result = any_find_slice(
8329 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8330 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008331 );
8332 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008333 result = any_find_slice(
8334 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8335 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008336 );
8337
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008339 Py_DECREF(sub);
8340
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341 return result;
8342}
8343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008344Py_ssize_t
8345PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8346 Py_ssize_t start, Py_ssize_t end,
8347 int direction)
8348{
8349 char *result;
8350 int kind;
8351 if (PyUnicode_READY(str) == -1)
8352 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008353 if (start < 0 || end < 0) {
8354 PyErr_SetString(PyExc_IndexError, "string index out of range");
8355 return -2;
8356 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008357 if (end > PyUnicode_GET_LENGTH(str))
8358 end = PyUnicode_GET_LENGTH(str);
8359 kind = PyUnicode_KIND(str);
8360 result = findchar(PyUnicode_1BYTE_DATA(str)
8361 + PyUnicode_KIND_SIZE(kind, start),
8362 kind,
8363 end-start, ch, direction);
8364 if (!result)
8365 return -1;
8366 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8367}
8368
Alexander Belopolsky40018472011-02-26 01:02:56 +00008369static int
8370tailmatch(PyUnicodeObject *self,
8371 PyUnicodeObject *substring,
8372 Py_ssize_t start,
8373 Py_ssize_t end,
8374 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008376 int kind_self;
8377 int kind_sub;
8378 void *data_self;
8379 void *data_sub;
8380 Py_ssize_t offset;
8381 Py_ssize_t i;
8382 Py_ssize_t end_sub;
8383
8384 if (PyUnicode_READY(self) == -1 ||
8385 PyUnicode_READY(substring) == -1)
8386 return 0;
8387
8388 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389 return 1;
8390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8392 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008393 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008396 kind_self = PyUnicode_KIND(self);
8397 data_self = PyUnicode_DATA(self);
8398 kind_sub = PyUnicode_KIND(substring);
8399 data_sub = PyUnicode_DATA(substring);
8400 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8401
8402 if (direction > 0)
8403 offset = end;
8404 else
8405 offset = start;
8406
8407 if (PyUnicode_READ(kind_self, data_self, offset) ==
8408 PyUnicode_READ(kind_sub, data_sub, 0) &&
8409 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8410 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8411 /* If both are of the same kind, memcmp is sufficient */
8412 if (kind_self == kind_sub) {
8413 return ! memcmp((char *)data_self +
8414 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8415 data_sub,
8416 PyUnicode_GET_LENGTH(substring) *
8417 PyUnicode_CHARACTER_SIZE(substring));
8418 }
8419 /* otherwise we have to compare each character by first accesing it */
8420 else {
8421 /* We do not need to compare 0 and len(substring)-1 because
8422 the if statement above ensured already that they are equal
8423 when we end up here. */
8424 // TODO: honor direction and do a forward or backwards search
8425 for (i = 1; i < end_sub; ++i) {
8426 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8427 PyUnicode_READ(kind_sub, data_sub, i))
8428 return 0;
8429 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008431 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432 }
8433
8434 return 0;
8435}
8436
Alexander Belopolsky40018472011-02-26 01:02:56 +00008437Py_ssize_t
8438PyUnicode_Tailmatch(PyObject *str,
8439 PyObject *substr,
8440 Py_ssize_t start,
8441 Py_ssize_t end,
8442 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008443{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008444 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008445
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446 str = PyUnicode_FromObject(str);
8447 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449 substr = PyUnicode_FromObject(substr);
8450 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 Py_DECREF(str);
8452 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453 }
Tim Petersced69f82003-09-16 20:30:58 +00008454
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 (PyUnicodeObject *)substr,
8457 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008458 Py_DECREF(str);
8459 Py_DECREF(substr);
8460 return result;
8461}
8462
Guido van Rossumd57fd912000-03-10 22:53:23 +00008463/* Apply fixfct filter to the Unicode object self and return a
8464 reference to the modified object */
8465
Alexander Belopolsky40018472011-02-26 01:02:56 +00008466static PyObject *
8467fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008468 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008469{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008470 PyObject *u;
8471 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008473 if (PyUnicode_READY(self) == -1)
8474 return NULL;
8475 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8476 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8477 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008480
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008481 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8482 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008483
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008484 /* fix functions return the new maximum character in a string,
8485 if the kind of the resulting unicode object does not change,
8486 everything is fine. Otherwise we need to change the string kind
8487 and re-run the fix function. */
8488 maxchar_new = fixfct((PyUnicodeObject*)u);
8489 if (maxchar_new == 0)
8490 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8491 else if (maxchar_new <= 127)
8492 maxchar_new = 127;
8493 else if (maxchar_new <= 255)
8494 maxchar_new = 255;
8495 else if (maxchar_new <= 65535)
8496 maxchar_new = 65535;
8497 else
8498 maxchar_new = 1114111; /* 0x10ffff */
8499
8500 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008501 /* fixfct should return TRUE if it modified the buffer. If
8502 FALSE, return a reference to the original buffer instead
8503 (to save space, not time) */
8504 Py_INCREF(self);
8505 Py_DECREF(u);
8506 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008508 else if (maxchar_new == maxchar_old) {
8509 return u;
8510 }
8511 else {
8512 /* In case the maximum character changed, we need to
8513 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008514 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008515 if (v == NULL) {
8516 Py_DECREF(u);
8517 return NULL;
8518 }
8519 if (maxchar_new > maxchar_old) {
8520 /* If the maxchar increased so that the kind changed, not all
8521 characters are representable anymore and we need to fix the
8522 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008523 if (PyUnicode_CopyCharacters(v, 0,
8524 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008525 PyUnicode_GET_LENGTH(self)) < 0)
8526 {
8527 Py_DECREF(u);
8528 return NULL;
8529 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008530 maxchar_old = fixfct((PyUnicodeObject*)v);
8531 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8532 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008533 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008534 if (PyUnicode_CopyCharacters(v, 0,
8535 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008536 PyUnicode_GET_LENGTH(self)) < 0)
8537 {
8538 Py_DECREF(u);
8539 return NULL;
8540 }
8541 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542
8543 Py_DECREF(u);
8544 return v;
8545 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546}
8547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008548static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008549fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008551 /* No need to call PyUnicode_READY(self) because this function is only
8552 called as a callback from fixup() which does it already. */
8553 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8554 const int kind = PyUnicode_KIND(self);
8555 void *data = PyUnicode_DATA(self);
8556 int touched = 0;
8557 Py_UCS4 maxchar = 0;
8558 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008560 for (i = 0; i < len; ++i) {
8561 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8562 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8563 if (up != ch) {
8564 if (up > maxchar)
8565 maxchar = up;
8566 PyUnicode_WRITE(kind, data, i, up);
8567 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008569 else if (ch > maxchar)
8570 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571 }
8572
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 if (touched)
8574 return maxchar;
8575 else
8576 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577}
8578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008580fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008582 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8583 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8584 const int kind = PyUnicode_KIND(self);
8585 void *data = PyUnicode_DATA(self);
8586 int touched = 0;
8587 Py_UCS4 maxchar = 0;
8588 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008589
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008590 for(i = 0; i < len; ++i) {
8591 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8592 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8593 if (lo != ch) {
8594 if (lo > maxchar)
8595 maxchar = lo;
8596 PyUnicode_WRITE(kind, data, i, lo);
8597 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008599 else if (ch > maxchar)
8600 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601 }
8602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603 if (touched)
8604 return maxchar;
8605 else
8606 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607}
8608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008609static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008610fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008612 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8613 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8614 const int kind = PyUnicode_KIND(self);
8615 void *data = PyUnicode_DATA(self);
8616 int touched = 0;
8617 Py_UCS4 maxchar = 0;
8618 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008620 for(i = 0; i < len; ++i) {
8621 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8622 Py_UCS4 nu = 0;
8623
8624 if (Py_UNICODE_ISUPPER(ch))
8625 nu = Py_UNICODE_TOLOWER(ch);
8626 else if (Py_UNICODE_ISLOWER(ch))
8627 nu = Py_UNICODE_TOUPPER(ch);
8628
8629 if (nu != 0) {
8630 if (nu > maxchar)
8631 maxchar = nu;
8632 PyUnicode_WRITE(kind, data, i, nu);
8633 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635 else if (ch > maxchar)
8636 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637 }
8638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008639 if (touched)
8640 return maxchar;
8641 else
8642 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643}
8644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008645static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008646fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008647{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8649 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8650 const int kind = PyUnicode_KIND(self);
8651 void *data = PyUnicode_DATA(self);
8652 int touched = 0;
8653 Py_UCS4 maxchar = 0;
8654 Py_ssize_t i = 0;
8655 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008656
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008657 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008658 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659
8660 ch = PyUnicode_READ(kind, data, i);
8661 if (!Py_UNICODE_ISUPPER(ch)) {
8662 maxchar = Py_UNICODE_TOUPPER(ch);
8663 PyUnicode_WRITE(kind, data, i, maxchar);
8664 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666 ++i;
8667 for(; i < len; ++i) {
8668 ch = PyUnicode_READ(kind, data, i);
8669 if (!Py_UNICODE_ISLOWER(ch)) {
8670 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8671 if (lo > maxchar)
8672 maxchar = lo;
8673 PyUnicode_WRITE(kind, data, i, lo);
8674 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 else if (ch > maxchar)
8677 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008678 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679
8680 if (touched)
8681 return maxchar;
8682 else
8683 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684}
8685
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008686static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008687fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008689 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8690 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8691 const int kind = PyUnicode_KIND(self);
8692 void *data = PyUnicode_DATA(self);
8693 Py_UCS4 maxchar = 0;
8694 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008695 int previous_is_cased;
8696
8697 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008698 if (len == 1) {
8699 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8700 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8701 if (ti != ch) {
8702 PyUnicode_WRITE(kind, data, i, ti);
8703 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 }
8705 else
8706 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008709 for(; i < len; ++i) {
8710 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8711 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008712
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008716 nu = Py_UNICODE_TOTITLE(ch);
8717
8718 if (nu > maxchar)
8719 maxchar = nu;
8720 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008721
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 if (Py_UNICODE_ISLOWER(ch) ||
8723 Py_UNICODE_ISUPPER(ch) ||
8724 Py_UNICODE_ISTITLE(ch))
8725 previous_is_cased = 1;
8726 else
8727 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008728 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008729 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008730}
8731
Tim Peters8ce9f162004-08-27 01:49:32 +00008732PyObject *
8733PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008734{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008735 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008736 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008737 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008738 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008739 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8740 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008741 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008742 Py_ssize_t sz, i, res_offset;
8743 Py_UCS4 maxchar = 0;
8744 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008745
Tim Peters05eba1f2004-08-27 21:32:02 +00008746 fseq = PySequence_Fast(seq, "");
8747 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008748 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008749 }
8750
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008751 /* NOTE: the following code can't call back into Python code,
8752 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008753 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008754
Tim Peters05eba1f2004-08-27 21:32:02 +00008755 seqlen = PySequence_Fast_GET_SIZE(fseq);
8756 /* If empty sequence, return u"". */
8757 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008758 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008759 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008760 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008761 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008762 /* If singleton sequence with an exact Unicode, return that. */
8763 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008764 item = items[0];
8765 if (PyUnicode_CheckExact(item)) {
8766 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008767 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008768 goto Done;
8769 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008770 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008771 else {
8772 /* Set up sep and seplen */
8773 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008774 /* fall back to a blank space separator */
8775 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008776 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008777 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008778 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008779 else {
8780 if (!PyUnicode_Check(separator)) {
8781 PyErr_Format(PyExc_TypeError,
8782 "separator: expected str instance,"
8783 " %.80s found",
8784 Py_TYPE(separator)->tp_name);
8785 goto onError;
8786 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 if (PyUnicode_READY(separator) == -1)
8788 goto onError;
8789 sep = separator;
8790 seplen = PyUnicode_GET_LENGTH(separator);
8791 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8792 /* inc refcount to keep this code path symetric with the
8793 above case of a blank separator */
8794 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008795 }
8796 }
8797
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008798 /* There are at least two things to join, or else we have a subclass
8799 * of str in the sequence.
8800 * Do a pre-pass to figure out the total amount of space we'll
8801 * need (sz), and see whether all argument are strings.
8802 */
8803 sz = 0;
8804 for (i = 0; i < seqlen; i++) {
8805 const Py_ssize_t old_sz = sz;
8806 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008807 if (!PyUnicode_Check(item)) {
8808 PyErr_Format(PyExc_TypeError,
8809 "sequence item %zd: expected str instance,"
8810 " %.80s found",
8811 i, Py_TYPE(item)->tp_name);
8812 goto onError;
8813 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008814 if (PyUnicode_READY(item) == -1)
8815 goto onError;
8816 sz += PyUnicode_GET_LENGTH(item);
8817 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8818 if (item_maxchar > maxchar)
8819 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008820 if (i != 0)
8821 sz += seplen;
8822 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8823 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008824 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008825 goto onError;
8826 }
8827 }
Tim Petersced69f82003-09-16 20:30:58 +00008828
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008829 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008830 if (res == NULL)
8831 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008832
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008833 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008834 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008835 Py_ssize_t itemlen;
8836 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008837 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008838 /* Copy item, and maybe the separator. */
8839 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008840 if (PyUnicode_CopyCharacters(res, res_offset,
8841 sep, 0, seplen) < 0)
8842 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008843 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008844 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008845 if (PyUnicode_CopyCharacters(res, res_offset,
8846 item, 0, itemlen) < 0)
8847 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008848 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008849 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008850 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008851
Benjamin Peterson29060642009-01-31 22:14:21 +00008852 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008853 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008854 Py_XDECREF(sep);
8855 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008856
Benjamin Peterson29060642009-01-31 22:14:21 +00008857 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008858 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008859 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008860 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861 return NULL;
8862}
8863
/* Store `value` into `length` consecutive characters of the buffer `data`,
   beginning at character index `start`.  `kind` selects the character width:
   1-byte buffers are filled with memset(), 2-byte buffers element by element
   as Py_UCS2, and every other kind element by element as Py_UCS4.

   `kind` must not be PyUnicode_WCHAR_KIND.  Arguments are evaluated more
   than once, so they must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
8886
Alexander Belopolsky40018472011-02-26 01:02:56 +00008887static PyUnicodeObject *
8888pad(PyUnicodeObject *self,
8889 Py_ssize_t left,
8890 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008891 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008892{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008893 PyObject *u;
8894 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008895 int kind;
8896 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008897
8898 if (left < 0)
8899 left = 0;
8900 if (right < 0)
8901 right = 0;
8902
Tim Peters7a29bd52001-09-12 03:03:31 +00008903 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008904 Py_INCREF(self);
8905 return self;
8906 }
8907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008908 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8909 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008910 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8911 return NULL;
8912 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008913 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8914 if (fill > maxchar)
8915 maxchar = fill;
8916 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008917 if (!u)
8918 return NULL;
8919
8920 kind = PyUnicode_KIND(u);
8921 data = PyUnicode_DATA(u);
8922 if (left)
8923 FILL(kind, data, fill, 0, left);
8924 if (right)
8925 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008926 if (PyUnicode_CopyCharacters(u, left,
8927 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008928 _PyUnicode_LENGTH(self)) < 0)
8929 {
8930 Py_DECREF(u);
8931 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932 }
8933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008936#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937
Alexander Belopolsky40018472011-02-26 01:02:56 +00008938PyObject *
8939PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008940{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008941 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942
8943 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008945 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947 switch(PyUnicode_KIND(string)) {
8948 case PyUnicode_1BYTE_KIND:
8949 list = ucs1lib_splitlines(
8950 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8951 PyUnicode_GET_LENGTH(string), keepends);
8952 break;
8953 case PyUnicode_2BYTE_KIND:
8954 list = ucs2lib_splitlines(
8955 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8956 PyUnicode_GET_LENGTH(string), keepends);
8957 break;
8958 case PyUnicode_4BYTE_KIND:
8959 list = ucs4lib_splitlines(
8960 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8961 PyUnicode_GET_LENGTH(string), keepends);
8962 break;
8963 default:
8964 assert(0);
8965 list = 0;
8966 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967 Py_DECREF(string);
8968 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969}
8970
Alexander Belopolsky40018472011-02-26 01:02:56 +00008971static PyObject *
8972split(PyUnicodeObject *self,
8973 PyUnicodeObject *substring,
8974 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976 int kind1, kind2, kind;
8977 void *buf1, *buf2;
8978 Py_ssize_t len1, len2;
8979 PyObject* out;
8980
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008982 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008984 if (PyUnicode_READY(self) == -1)
8985 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008987 if (substring == NULL)
8988 switch(PyUnicode_KIND(self)) {
8989 case PyUnicode_1BYTE_KIND:
8990 return ucs1lib_split_whitespace(
8991 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8992 PyUnicode_GET_LENGTH(self), maxcount
8993 );
8994 case PyUnicode_2BYTE_KIND:
8995 return ucs2lib_split_whitespace(
8996 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8997 PyUnicode_GET_LENGTH(self), maxcount
8998 );
8999 case PyUnicode_4BYTE_KIND:
9000 return ucs4lib_split_whitespace(
9001 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9002 PyUnicode_GET_LENGTH(self), maxcount
9003 );
9004 default:
9005 assert(0);
9006 return NULL;
9007 }
9008
9009 if (PyUnicode_READY(substring) == -1)
9010 return NULL;
9011
9012 kind1 = PyUnicode_KIND(self);
9013 kind2 = PyUnicode_KIND(substring);
9014 kind = kind1 > kind2 ? kind1 : kind2;
9015 buf1 = PyUnicode_DATA(self);
9016 buf2 = PyUnicode_DATA(substring);
9017 if (kind1 != kind)
9018 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9019 if (!buf1)
9020 return NULL;
9021 if (kind2 != kind)
9022 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9023 if (!buf2) {
9024 if (kind1 != kind) PyMem_Free(buf1);
9025 return NULL;
9026 }
9027 len1 = PyUnicode_GET_LENGTH(self);
9028 len2 = PyUnicode_GET_LENGTH(substring);
9029
9030 switch(kind) {
9031 case PyUnicode_1BYTE_KIND:
9032 out = ucs1lib_split(
9033 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9034 break;
9035 case PyUnicode_2BYTE_KIND:
9036 out = ucs2lib_split(
9037 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9038 break;
9039 case PyUnicode_4BYTE_KIND:
9040 out = ucs4lib_split(
9041 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9042 break;
9043 default:
9044 out = NULL;
9045 }
9046 if (kind1 != kind)
9047 PyMem_Free(buf1);
9048 if (kind2 != kind)
9049 PyMem_Free(buf2);
9050 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051}
9052
Alexander Belopolsky40018472011-02-26 01:02:56 +00009053static PyObject *
9054rsplit(PyUnicodeObject *self,
9055 PyUnicodeObject *substring,
9056 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009057{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009058 int kind1, kind2, kind;
9059 void *buf1, *buf2;
9060 Py_ssize_t len1, len2;
9061 PyObject* out;
9062
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009063 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009064 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066 if (PyUnicode_READY(self) == -1)
9067 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069 if (substring == NULL)
9070 switch(PyUnicode_KIND(self)) {
9071 case PyUnicode_1BYTE_KIND:
9072 return ucs1lib_rsplit_whitespace(
9073 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9074 PyUnicode_GET_LENGTH(self), maxcount
9075 );
9076 case PyUnicode_2BYTE_KIND:
9077 return ucs2lib_rsplit_whitespace(
9078 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9079 PyUnicode_GET_LENGTH(self), maxcount
9080 );
9081 case PyUnicode_4BYTE_KIND:
9082 return ucs4lib_rsplit_whitespace(
9083 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9084 PyUnicode_GET_LENGTH(self), maxcount
9085 );
9086 default:
9087 assert(0);
9088 return NULL;
9089 }
9090
9091 if (PyUnicode_READY(substring) == -1)
9092 return NULL;
9093
9094 kind1 = PyUnicode_KIND(self);
9095 kind2 = PyUnicode_KIND(substring);
9096 kind = kind1 > kind2 ? kind1 : kind2;
9097 buf1 = PyUnicode_DATA(self);
9098 buf2 = PyUnicode_DATA(substring);
9099 if (kind1 != kind)
9100 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9101 if (!buf1)
9102 return NULL;
9103 if (kind2 != kind)
9104 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9105 if (!buf2) {
9106 if (kind1 != kind) PyMem_Free(buf1);
9107 return NULL;
9108 }
9109 len1 = PyUnicode_GET_LENGTH(self);
9110 len2 = PyUnicode_GET_LENGTH(substring);
9111
9112 switch(kind) {
9113 case PyUnicode_1BYTE_KIND:
9114 out = ucs1lib_rsplit(
9115 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9116 break;
9117 case PyUnicode_2BYTE_KIND:
9118 out = ucs2lib_rsplit(
9119 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9120 break;
9121 case PyUnicode_4BYTE_KIND:
9122 out = ucs4lib_rsplit(
9123 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9124 break;
9125 default:
9126 out = NULL;
9127 }
9128 if (kind1 != kind)
9129 PyMem_Free(buf1);
9130 if (kind2 != kind)
9131 PyMem_Free(buf2);
9132 return out;
9133}
9134
9135static Py_ssize_t
9136anylib_find(int kind, void *buf1, Py_ssize_t len1,
9137 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9138{
9139 switch(kind) {
9140 case PyUnicode_1BYTE_KIND:
9141 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9142 case PyUnicode_2BYTE_KIND:
9143 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9144 case PyUnicode_4BYTE_KIND:
9145 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9146 }
9147 assert(0);
9148 return -1;
9149}
9150
9151static Py_ssize_t
9152anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9153 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9154{
9155 switch(kind) {
9156 case PyUnicode_1BYTE_KIND:
9157 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9158 case PyUnicode_2BYTE_KIND:
9159 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9160 case PyUnicode_4BYTE_KIND:
9161 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9162 }
9163 assert(0);
9164 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009165}
9166
Alexander Belopolsky40018472011-02-26 01:02:56 +00009167static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009168replace(PyObject *self, PyObject *str1,
9169 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009170{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009171 PyObject *u;
9172 char *sbuf = PyUnicode_DATA(self);
9173 char *buf1 = PyUnicode_DATA(str1);
9174 char *buf2 = PyUnicode_DATA(str2);
9175 int srelease = 0, release1 = 0, release2 = 0;
9176 int skind = PyUnicode_KIND(self);
9177 int kind1 = PyUnicode_KIND(str1);
9178 int kind2 = PyUnicode_KIND(str2);
9179 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9180 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9181 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009182
9183 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009184 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009185 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009186 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009188 if (skind < kind1)
9189 /* substring too wide to be present */
9190 goto nothing;
9191
9192 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009193 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009194 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009195 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009196 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009197 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009198 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009199 Py_UCS4 u1, u2, maxchar;
9200 int mayshrink, rkind;
9201 u1 = PyUnicode_READ_CHAR(str1, 0);
9202 if (!findchar(sbuf, PyUnicode_KIND(self),
9203 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009204 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009205 u2 = PyUnicode_READ_CHAR(str2, 0);
9206 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9207 /* Replacing u1 with u2 may cause a maxchar reduction in the
9208 result string. */
9209 mayshrink = maxchar > 127;
9210 if (u2 > maxchar) {
9211 maxchar = u2;
9212 mayshrink = 0;
9213 }
9214 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009215 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009216 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009217 if (PyUnicode_CopyCharacters(u, 0,
9218 (PyObject*)self, 0, slen) < 0)
9219 {
9220 Py_DECREF(u);
9221 return NULL;
9222 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009223 rkind = PyUnicode_KIND(u);
9224 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9225 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009226 if (--maxcount < 0)
9227 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009228 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009229 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009230 if (mayshrink) {
9231 PyObject *tmp = u;
9232 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9233 PyUnicode_GET_LENGTH(tmp));
9234 Py_DECREF(tmp);
9235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237 int rkind = skind;
9238 char *res;
9239 if (kind1 < rkind) {
9240 /* widen substring */
9241 buf1 = _PyUnicode_AsKind(str1, rkind);
9242 if (!buf1) goto error;
9243 release1 = 1;
9244 }
9245 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009246 if (i < 0)
9247 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009248 if (rkind > kind2) {
9249 /* widen replacement */
9250 buf2 = _PyUnicode_AsKind(str2, rkind);
9251 if (!buf2) goto error;
9252 release2 = 1;
9253 }
9254 else if (rkind < kind2) {
9255 /* widen self and buf1 */
9256 rkind = kind2;
9257 if (release1) PyMem_Free(buf1);
9258 sbuf = _PyUnicode_AsKind(self, rkind);
9259 if (!sbuf) goto error;
9260 srelease = 1;
9261 buf1 = _PyUnicode_AsKind(str1, rkind);
9262 if (!buf1) goto error;
9263 release1 = 1;
9264 }
9265 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9266 if (!res) {
9267 PyErr_NoMemory();
9268 goto error;
9269 }
9270 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009271 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009272 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9273 buf2,
9274 PyUnicode_KIND_SIZE(rkind, len2));
9275 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009276
9277 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9279 slen-i,
9280 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009281 if (i == -1)
9282 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9284 buf2,
9285 PyUnicode_KIND_SIZE(rkind, len2));
9286 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009287 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009288
9289 u = PyUnicode_FromKindAndData(rkind, res, slen);
9290 PyMem_Free(res);
9291 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009292 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 Py_ssize_t n, i, j, ires;
9296 Py_ssize_t product, new_size;
9297 int rkind = skind;
9298 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009300 if (kind1 < rkind) {
9301 buf1 = _PyUnicode_AsKind(str1, rkind);
9302 if (!buf1) goto error;
9303 release1 = 1;
9304 }
9305 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009306 if (n == 0)
9307 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308 if (kind2 < rkind) {
9309 buf2 = _PyUnicode_AsKind(str2, rkind);
9310 if (!buf2) goto error;
9311 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 else if (kind2 > rkind) {
9314 rkind = kind2;
9315 sbuf = _PyUnicode_AsKind(self, rkind);
9316 if (!sbuf) goto error;
9317 srelease = 1;
9318 if (release1) PyMem_Free(buf1);
9319 buf1 = _PyUnicode_AsKind(str1, rkind);
9320 if (!buf1) goto error;
9321 release1 = 1;
9322 }
9323 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9324 PyUnicode_GET_LENGTH(str1))); */
9325 product = n * (len2-len1);
9326 if ((product / (len2-len1)) != n) {
9327 PyErr_SetString(PyExc_OverflowError,
9328 "replace string is too long");
9329 goto error;
9330 }
9331 new_size = slen + product;
9332 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9333 PyErr_SetString(PyExc_OverflowError,
9334 "replace string is too long");
9335 goto error;
9336 }
9337 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9338 if (!res)
9339 goto error;
9340 ires = i = 0;
9341 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009342 while (n-- > 0) {
9343 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009344 j = anylib_find(rkind,
9345 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9346 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009347 if (j == -1)
9348 break;
9349 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009350 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009351 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9352 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9353 PyUnicode_KIND_SIZE(rkind, j-i));
9354 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009355 }
9356 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009357 if (len2 > 0) {
9358 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9359 buf2,
9360 PyUnicode_KIND_SIZE(rkind, len2));
9361 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009362 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009364 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009365 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009366 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009367 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9368 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9369 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009370 } else {
9371 /* interleave */
9372 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009373 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9374 buf2,
9375 PyUnicode_KIND_SIZE(rkind, len2));
9376 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009377 if (--n <= 0)
9378 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9380 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9381 PyUnicode_KIND_SIZE(rkind, 1));
9382 ires++;
9383 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009384 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9386 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9387 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009388 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009390 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009392 if (srelease)
9393 PyMem_FREE(sbuf);
9394 if (release1)
9395 PyMem_FREE(buf1);
9396 if (release2)
9397 PyMem_FREE(buf2);
9398 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009399
Benjamin Peterson29060642009-01-31 22:14:21 +00009400 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009401 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402 if (srelease)
9403 PyMem_FREE(sbuf);
9404 if (release1)
9405 PyMem_FREE(buf1);
9406 if (release2)
9407 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009408 if (PyUnicode_CheckExact(self)) {
9409 Py_INCREF(self);
9410 return (PyObject *) self;
9411 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009412 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009413 error:
9414 if (srelease && sbuf)
9415 PyMem_FREE(sbuf);
9416 if (release1 && buf1)
9417 PyMem_FREE(buf1);
9418 if (release2 && buf2)
9419 PyMem_FREE(buf2);
9420 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009421}
9422
9423/* --- Unicode Object Methods --------------------------------------------- */
9424
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009425PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009426 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009427\n\
9428Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009429characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009430
9431static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009432unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009433{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009434 return fixup(self, fixtitle);
9435}
9436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009437PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009438 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009439\n\
9440Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009441have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009442
9443static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009444unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009445{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009446 return fixup(self, fixcapitalize);
9447}
9448
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of capwords(): split on whitespace, capitalize
   each word in place in the list, then join the list again.  Returns NULL
   if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx = 0;

    /* Break the string into whitespace-separated words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap every word for its capitalized form */
    while (idx < PyList_GET_SIZE(words)) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (result == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
        idx++;
    }

    /* Glue the words back together into one string */
    result = PyUnicode_Join(NULL, words);

  done:
    /* reached on success and on failure; `result` is NULL on failure */
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
9486
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009487/* Argument converter. Coerces to a single unicode character */
9488
9489static int
9490convert_uc(PyObject *obj, void *addr)
9491{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009492 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009493 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009494
Benjamin Peterson14339b62009-01-31 16:36:08 +00009495 uniobj = PyUnicode_FromObject(obj);
9496 if (uniobj == NULL) {
9497 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009498 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009499 return 0;
9500 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009502 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009503 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009504 Py_DECREF(uniobj);
9505 return 0;
9506 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009507 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009508 Py_DECREF(uniobj);
9509 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009510}
9511
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009512PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009513 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009515Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009516done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517
9518static PyObject *
9519unicode_center(PyUnicodeObject *self, PyObject *args)
9520{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009521 Py_ssize_t marg, left;
9522 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523 Py_UCS4 fillchar = ' ';
9524
Victor Stinnere9a29352011-10-01 02:14:59 +02009525 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009526 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009527
Victor Stinnere9a29352011-10-01 02:14:59 +02009528 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009529 return NULL;
9530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009531 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532 Py_INCREF(self);
9533 return (PyObject*) self;
9534 }
9535
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009537 left = marg / 2 + (marg & width & 1);
9538
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009539 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009540}
9541
Marc-André Lemburge5034372000-08-08 08:04:29 +00009542#if 0
9543
9544/* This code should go into some future Unicode collation support
9545 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009546 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009547
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009548/* speedy UTF-16 code point order comparison */
9549/* gleaned from: */
9550/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9551
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009552static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009553{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009554 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009555 0, 0, 0, 0, 0, 0, 0, 0,
9556 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009557 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009558};
9559
Guido van Rossumd57fd912000-03-10 22:53:23 +00009560static int
9561unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9562{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009563 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009564
Guido van Rossumd57fd912000-03-10 22:53:23 +00009565 Py_UNICODE *s1 = str1->str;
9566 Py_UNICODE *s2 = str2->str;
9567
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568 len1 = str1->_base._base.length;
9569 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009570
Guido van Rossumd57fd912000-03-10 22:53:23 +00009571 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009572 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009573
9574 c1 = *s1++;
9575 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009576
Benjamin Peterson29060642009-01-31 22:14:21 +00009577 if (c1 > (1<<11) * 26)
9578 c1 += utf16Fixup[c1>>11];
9579 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009580 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009581 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009582
9583 if (c1 != c2)
9584 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009585
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009586 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009587 }
9588
9589 return (len1 < len2) ? -1 : (len1 != len2);
9590}
9591
Marc-André Lemburge5034372000-08-08 08:04:29 +00009592#else
9593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594/* This function assumes that str1 and str2 are readied by the caller. */
9595
Marc-André Lemburge5034372000-08-08 08:04:29 +00009596static int
9597unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9598{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009599 int kind1, kind2;
9600 void *data1, *data2;
9601 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603 kind1 = PyUnicode_KIND(str1);
9604 kind2 = PyUnicode_KIND(str2);
9605 data1 = PyUnicode_DATA(str1);
9606 data2 = PyUnicode_DATA(str2);
9607 len1 = PyUnicode_GET_LENGTH(str1);
9608 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610 for (i = 0; i < len1 && i < len2; ++i) {
9611 Py_UCS4 c1, c2;
9612 c1 = PyUnicode_READ(kind1, data1, i);
9613 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009614
9615 if (c1 != c2)
9616 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009617 }
9618
9619 return (len1 < len2) ? -1 : (len1 != len2);
9620}
9621
9622#endif
9623
Alexander Belopolsky40018472011-02-26 01:02:56 +00009624int
9625PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009626{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009627 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9628 if (PyUnicode_READY(left) == -1 ||
9629 PyUnicode_READY(right) == -1)
9630 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009631 return unicode_compare((PyUnicodeObject *)left,
9632 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009634 PyErr_Format(PyExc_TypeError,
9635 "Can't compare %.100s and %.100s",
9636 left->ob_type->tp_name,
9637 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009638 return -1;
9639}
9640
Martin v. Löwis5b222132007-06-10 09:51:05 +00009641int
9642PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9643{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 Py_ssize_t i;
9645 int kind;
9646 void *data;
9647 Py_UCS4 chr;
9648
Victor Stinner910337b2011-10-03 03:20:16 +02009649 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650 if (PyUnicode_READY(uni) == -1)
9651 return -1;
9652 kind = PyUnicode_KIND(uni);
9653 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009654 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009655 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9656 if (chr != str[i])
9657 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009658 /* This check keeps Python strings that end in '\0' from comparing equal
9659 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009660 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009661 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009662 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009663 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009664 return 0;
9665}
9666
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009667
Benjamin Peterson29060642009-01-31 22:14:21 +00009668#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009669 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009670
Alexander Belopolsky40018472011-02-26 01:02:56 +00009671PyObject *
9672PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009673{
9674 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009675
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009676 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9677 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678 if (PyUnicode_READY(left) == -1 ||
9679 PyUnicode_READY(right) == -1)
9680 return NULL;
9681 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9682 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009683 if (op == Py_EQ) {
9684 Py_INCREF(Py_False);
9685 return Py_False;
9686 }
9687 if (op == Py_NE) {
9688 Py_INCREF(Py_True);
9689 return Py_True;
9690 }
9691 }
9692 if (left == right)
9693 result = 0;
9694 else
9695 result = unicode_compare((PyUnicodeObject *)left,
9696 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009697
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009698 /* Convert the return value to a Boolean */
9699 switch (op) {
9700 case Py_EQ:
9701 v = TEST_COND(result == 0);
9702 break;
9703 case Py_NE:
9704 v = TEST_COND(result != 0);
9705 break;
9706 case Py_LE:
9707 v = TEST_COND(result <= 0);
9708 break;
9709 case Py_GE:
9710 v = TEST_COND(result >= 0);
9711 break;
9712 case Py_LT:
9713 v = TEST_COND(result == -1);
9714 break;
9715 case Py_GT:
9716 v = TEST_COND(result == 1);
9717 break;
9718 default:
9719 PyErr_BadArgument();
9720 return NULL;
9721 }
9722 Py_INCREF(v);
9723 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009724 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009725
Brian Curtindfc80e32011-08-10 20:28:54 -05009726 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009727}
9728
Alexander Belopolsky40018472011-02-26 01:02:56 +00009729int
9730PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009731{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009732 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009733 int kind1, kind2, kind;
9734 void *buf1, *buf2;
9735 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009736 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009737
9738 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009739 sub = PyUnicode_FromObject(element);
9740 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009741 PyErr_Format(PyExc_TypeError,
9742 "'in <string>' requires string as left operand, not %s",
9743 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009744 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009745 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009746 if (PyUnicode_READY(sub) == -1)
9747 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009748
Thomas Wouters477c8d52006-05-27 19:21:47 +00009749 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009750 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009751 Py_DECREF(sub);
9752 return -1;
9753 }
9754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009755 kind1 = PyUnicode_KIND(str);
9756 kind2 = PyUnicode_KIND(sub);
9757 kind = kind1 > kind2 ? kind1 : kind2;
9758 buf1 = PyUnicode_DATA(str);
9759 buf2 = PyUnicode_DATA(sub);
9760 if (kind1 != kind)
9761 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9762 if (!buf1) {
9763 Py_DECREF(sub);
9764 return -1;
9765 }
9766 if (kind2 != kind)
9767 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9768 if (!buf2) {
9769 Py_DECREF(sub);
9770 if (kind1 != kind) PyMem_Free(buf1);
9771 return -1;
9772 }
9773 len1 = PyUnicode_GET_LENGTH(str);
9774 len2 = PyUnicode_GET_LENGTH(sub);
9775
9776 switch(kind) {
9777 case PyUnicode_1BYTE_KIND:
9778 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9779 break;
9780 case PyUnicode_2BYTE_KIND:
9781 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9782 break;
9783 case PyUnicode_4BYTE_KIND:
9784 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9785 break;
9786 default:
9787 result = -1;
9788 assert(0);
9789 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009790
9791 Py_DECREF(str);
9792 Py_DECREF(sub);
9793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009794 if (kind1 != kind)
9795 PyMem_Free(buf1);
9796 if (kind2 != kind)
9797 PyMem_Free(buf2);
9798
Guido van Rossum403d68b2000-03-13 15:55:09 +00009799 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009800}
9801
Guido van Rossumd57fd912000-03-10 22:53:23 +00009802/* Concat to string or Unicode object giving a new Unicode object. */
9803
Alexander Belopolsky40018472011-02-26 01:02:56 +00009804PyObject *
9805PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009806{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009807 PyObject *u = NULL, *v = NULL, *w;
9808 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009809
9810 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009811 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009812 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009813 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009814 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009815 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009816 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009817
9818 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009819 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009820 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009821 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009822 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009823 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009824 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009825 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009826 }
9827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009829 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830
Guido van Rossumd57fd912000-03-10 22:53:23 +00009831 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009832 w = PyUnicode_New(
9833 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9834 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009835 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009836 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009837 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9838 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009839 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009840 v, 0,
9841 PyUnicode_GET_LENGTH(v)) < 0)
9842 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009843 Py_DECREF(u);
9844 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009845 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009846
Benjamin Peterson29060642009-01-31 22:14:21 +00009847 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009848 Py_XDECREF(u);
9849 Py_XDECREF(v);
9850 return NULL;
9851}
9852
Walter Dörwald1ab83302007-05-18 17:15:44 +00009853void
Victor Stinner23e56682011-10-03 03:54:37 +02009854PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +00009855{
Victor Stinner23e56682011-10-03 03:54:37 +02009856 PyObject *left, *res;
9857
9858 if (p_left == NULL) {
9859 if (!PyErr_Occurred())
9860 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +00009861 return;
9862 }
Victor Stinner23e56682011-10-03 03:54:37 +02009863 left = *p_left;
9864 if (right == NULL || !PyUnicode_Check(left)) {
9865 if (!PyErr_Occurred())
9866 PyErr_BadInternalCall();
9867 goto error;
9868 }
9869
9870 if (PyUnicode_CheckExact(left) && left != unicode_empty
9871 && PyUnicode_CheckExact(right) && right != unicode_empty
9872 && unicode_resizable(left)
9873 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
9874 || _PyUnicode_WSTR(left) != NULL))
9875 {
9876 Py_ssize_t u_len, v_len, new_len, copied;
9877
9878 /* FIXME: don't make wstr string ready */
9879 if (PyUnicode_READY(left))
9880 goto error;
9881 if (PyUnicode_READY(right))
9882 goto error;
9883
9884 /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */
9885 if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left))
9886 {
9887 u_len = PyUnicode_GET_LENGTH(left);
9888 v_len = PyUnicode_GET_LENGTH(right);
9889 if (u_len > PY_SSIZE_T_MAX - v_len) {
9890 PyErr_SetString(PyExc_OverflowError,
9891 "strings are too large to concat");
9892 goto error;
9893 }
9894 new_len = u_len + v_len;
9895
9896 /* Now we own the last reference to 'left', so we can resize it
9897 * in-place.
9898 */
9899 if (unicode_resize(&left, new_len) != 0) {
9900 /* XXX if _PyUnicode_Resize() fails, 'left' has been
9901 * deallocated so it cannot be put back into
9902 * 'variable'. The MemoryError is raised when there
9903 * is no value in 'variable', which might (very
9904 * remotely) be a cause of incompatibilities.
9905 */
9906 goto error;
9907 }
9908 /* copy 'right' into the newly allocated area of 'left' */
9909 copied = PyUnicode_CopyCharacters(left, u_len,
9910 right, 0,
9911 v_len);
9912 assert(0 <= copied);
9913 *p_left = left;
9914 return;
9915 }
9916 }
9917
9918 res = PyUnicode_Concat(left, right);
9919 if (res == NULL)
9920 goto error;
9921 Py_DECREF(left);
9922 *p_left = res;
9923 return;
9924
9925error:
9926 Py_DECREF(*p_left);
9927 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009928}
9929
9930void
9931PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9932{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009933 PyUnicode_Append(pleft, right);
9934 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009935}
9936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009937PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009938 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009939\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009940Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009941string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009942interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009943
9944static PyObject *
9945unicode_count(PyUnicodeObject *self, PyObject *args)
9946{
9947 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009948 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009949 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009950 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951 int kind1, kind2, kind;
9952 void *buf1, *buf2;
9953 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009954
Jesus Ceaac451502011-04-20 17:09:23 +02009955 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9956 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009957 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 kind1 = PyUnicode_KIND(self);
9960 kind2 = PyUnicode_KIND(substring);
9961 kind = kind1 > kind2 ? kind1 : kind2;
9962 buf1 = PyUnicode_DATA(self);
9963 buf2 = PyUnicode_DATA(substring);
9964 if (kind1 != kind)
9965 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9966 if (!buf1) {
9967 Py_DECREF(substring);
9968 return NULL;
9969 }
9970 if (kind2 != kind)
9971 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9972 if (!buf2) {
9973 Py_DECREF(substring);
9974 if (kind1 != kind) PyMem_Free(buf1);
9975 return NULL;
9976 }
9977 len1 = PyUnicode_GET_LENGTH(self);
9978 len2 = PyUnicode_GET_LENGTH(substring);
9979
9980 ADJUST_INDICES(start, end, len1);
9981 switch(kind) {
9982 case PyUnicode_1BYTE_KIND:
9983 iresult = ucs1lib_count(
9984 ((Py_UCS1*)buf1) + start, end - start,
9985 buf2, len2, PY_SSIZE_T_MAX
9986 );
9987 break;
9988 case PyUnicode_2BYTE_KIND:
9989 iresult = ucs2lib_count(
9990 ((Py_UCS2*)buf1) + start, end - start,
9991 buf2, len2, PY_SSIZE_T_MAX
9992 );
9993 break;
9994 case PyUnicode_4BYTE_KIND:
9995 iresult = ucs4lib_count(
9996 ((Py_UCS4*)buf1) + start, end - start,
9997 buf2, len2, PY_SSIZE_T_MAX
9998 );
9999 break;
10000 default:
10001 assert(0); iresult = 0;
10002 }
10003
10004 result = PyLong_FromSsize_t(iresult);
10005
10006 if (kind1 != kind)
10007 PyMem_Free(buf1);
10008 if (kind2 != kind)
10009 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010010
10011 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010012
Guido van Rossumd57fd912000-03-10 22:53:23 +000010013 return result;
10014}
10015
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010016PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010017 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010019Encode S using the codec registered for encoding. Default encoding\n\
10020is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010021handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010022a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10023'xmlcharrefreplace' as well as any other name registered with\n\
10024codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010025
10026static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010027unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010028{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010029 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010030 char *encoding = NULL;
10031 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010032
Benjamin Peterson308d6372009-09-18 21:42:35 +000010033 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10034 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010035 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010036 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010037}
10038
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010039PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010040 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010041\n\
10042Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010043If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010044
10045static PyObject*
10046unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10047{
10048 Py_UNICODE *e;
10049 Py_UNICODE *p;
10050 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010051 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010053 PyUnicodeObject *u;
10054 int tabsize = 8;
10055
10056 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010057 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
10060 return NULL;
10061
Thomas Wouters7e474022000-07-16 12:04:32 +000010062 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010063 i = 0; /* chars up to and including most recent \n or \r */
10064 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
10066 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010067 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010068 if (tabsize > 0) {
10069 incr = tabsize - (j % tabsize); /* cannot overflow */
10070 if (j > PY_SSIZE_T_MAX - incr)
10071 goto overflow1;
10072 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010073 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010074 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010075 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010076 if (j > PY_SSIZE_T_MAX - 1)
10077 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010078 j++;
10079 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010080 if (i > PY_SSIZE_T_MAX - j)
10081 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010082 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010083 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010084 }
10085 }
10086
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010087 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +000010088 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010089
Guido van Rossumd57fd912000-03-10 22:53:23 +000010090 /* Second pass: create output string and fill it */
10091 u = _PyUnicode_New(i + j);
10092 if (!u)
10093 return NULL;
10094
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010095 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 q = _PyUnicode_WSTR(u); /* next output char */
10097 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010100 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010101 if (tabsize > 0) {
10102 i = tabsize - (j % tabsize);
10103 j += i;
10104 while (i--) {
10105 if (q >= qe)
10106 goto overflow2;
10107 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010108 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010109 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010110 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010111 else {
10112 if (q >= qe)
10113 goto overflow2;
10114 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010115 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010116 if (*p == '\n' || *p == '\r')
10117 j = 0;
10118 }
10119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 if (PyUnicode_READY(u) == -1) {
10121 Py_DECREF(u);
10122 return NULL;
10123 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010124 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010125
10126 overflow2:
10127 Py_DECREF(u);
10128 overflow1:
10129 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10130 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010131}
10132
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010133PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010134 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010135\n\
10136Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010137such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010138arguments start and end are interpreted as in slice notation.\n\
10139\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010140Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141
10142static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010144{
Jesus Ceaac451502011-04-20 17:09:23 +020010145 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010146 Py_ssize_t start;
10147 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010148 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149
Jesus Ceaac451502011-04-20 17:09:23 +020010150 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10151 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010152 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 if (PyUnicode_READY(self) == -1)
10155 return NULL;
10156 if (PyUnicode_READY(substring) == -1)
10157 return NULL;
10158
10159 result = any_find_slice(
10160 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10161 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010162 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010163
10164 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 if (result == -2)
10167 return NULL;
10168
Christian Heimes217cfd12007-12-02 14:31:20 +000010169 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170}
10171
10172static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010173unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010175 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10176 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010179}
10180
Guido van Rossumc2504932007-09-18 19:42:40 +000010181/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010182 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010183static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010184unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010185{
Guido van Rossumc2504932007-09-18 19:42:40 +000010186 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010187 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 if (_PyUnicode_HASH(self) != -1)
10190 return _PyUnicode_HASH(self);
10191 if (PyUnicode_READY(self) == -1)
10192 return -1;
10193 len = PyUnicode_GET_LENGTH(self);
10194
10195 /* The hash function as a macro, gets expanded three times below. */
10196#define HASH(P) \
10197 x = (Py_uhash_t)*P << 7; \
10198 while (--len >= 0) \
10199 x = (1000003*x) ^ (Py_uhash_t)*P++;
10200
10201 switch (PyUnicode_KIND(self)) {
10202 case PyUnicode_1BYTE_KIND: {
10203 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10204 HASH(c);
10205 break;
10206 }
10207 case PyUnicode_2BYTE_KIND: {
10208 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10209 HASH(s);
10210 break;
10211 }
10212 default: {
10213 Py_UCS4 *l;
10214 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10215 "Impossible switch case in unicode_hash");
10216 l = PyUnicode_4BYTE_DATA(self);
10217 HASH(l);
10218 break;
10219 }
10220 }
10221 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10222
Guido van Rossumc2504932007-09-18 19:42:40 +000010223 if (x == -1)
10224 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010226 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010227}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010228#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010230PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010231 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010233Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234
10235static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010236unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010237{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010238 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010239 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010240 Py_ssize_t start;
10241 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242
Jesus Ceaac451502011-04-20 17:09:23 +020010243 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10244 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 if (PyUnicode_READY(self) == -1)
10248 return NULL;
10249 if (PyUnicode_READY(substring) == -1)
10250 return NULL;
10251
10252 result = any_find_slice(
10253 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10254 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010255 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256
10257 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 if (result == -2)
10260 return NULL;
10261
Guido van Rossumd57fd912000-03-10 22:53:23 +000010262 if (result < 0) {
10263 PyErr_SetString(PyExc_ValueError, "substring not found");
10264 return NULL;
10265 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010266
Christian Heimes217cfd12007-12-02 14:31:20 +000010267 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010268}
10269
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010270PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010271 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010272\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010273Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010274at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010275
10276static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010277unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010278{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 Py_ssize_t i, length;
10280 int kind;
10281 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010282 int cased;
10283
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 if (PyUnicode_READY(self) == -1)
10285 return NULL;
10286 length = PyUnicode_GET_LENGTH(self);
10287 kind = PyUnicode_KIND(self);
10288 data = PyUnicode_DATA(self);
10289
Guido van Rossumd57fd912000-03-10 22:53:23 +000010290 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 if (length == 1)
10292 return PyBool_FromLong(
10293 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010294
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010295 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010297 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010298
Guido van Rossumd57fd912000-03-10 22:53:23 +000010299 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 for (i = 0; i < length; i++) {
10301 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010302
Benjamin Peterson29060642009-01-31 22:14:21 +000010303 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10304 return PyBool_FromLong(0);
10305 else if (!cased && Py_UNICODE_ISLOWER(ch))
10306 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010307 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010308 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010309}
10310
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010311PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010312 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010313\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010314Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010315at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316
10317static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010318unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010319{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 Py_ssize_t i, length;
10321 int kind;
10322 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010323 int cased;
10324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 if (PyUnicode_READY(self) == -1)
10326 return NULL;
10327 length = PyUnicode_GET_LENGTH(self);
10328 kind = PyUnicode_KIND(self);
10329 data = PyUnicode_DATA(self);
10330
Guido van Rossumd57fd912000-03-10 22:53:23 +000010331 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 if (length == 1)
10333 return PyBool_FromLong(
10334 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010335
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010336 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010338 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010339
Guido van Rossumd57fd912000-03-10 22:53:23 +000010340 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 for (i = 0; i < length; i++) {
10342 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010343
Benjamin Peterson29060642009-01-31 22:14:21 +000010344 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10345 return PyBool_FromLong(0);
10346 else if (!cased && Py_UNICODE_ISUPPER(ch))
10347 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010349 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010350}
10351
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010352PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010353 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010354\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010355Return True if S is a titlecased string and there is at least one\n\
10356character in S, i.e. upper- and titlecase characters may only\n\
10357follow uncased characters and lowercase characters only cased ones.\n\
10358Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359
10360static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010361unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010362{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 Py_ssize_t i, length;
10364 int kind;
10365 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010366 int cased, previous_is_cased;
10367
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 if (PyUnicode_READY(self) == -1)
10369 return NULL;
10370 length = PyUnicode_GET_LENGTH(self);
10371 kind = PyUnicode_KIND(self);
10372 data = PyUnicode_DATA(self);
10373
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 if (length == 1) {
10376 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10377 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10378 (Py_UNICODE_ISUPPER(ch) != 0));
10379 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010380
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010381 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010383 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010384
Guido van Rossumd57fd912000-03-10 22:53:23 +000010385 cased = 0;
10386 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 for (i = 0; i < length; i++) {
10388 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010389
Benjamin Peterson29060642009-01-31 22:14:21 +000010390 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10391 if (previous_is_cased)
10392 return PyBool_FromLong(0);
10393 previous_is_cased = 1;
10394 cased = 1;
10395 }
10396 else if (Py_UNICODE_ISLOWER(ch)) {
10397 if (!previous_is_cased)
10398 return PyBool_FromLong(0);
10399 previous_is_cased = 1;
10400 cased = 1;
10401 }
10402 else
10403 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010404 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010405 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406}
10407
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010408PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010409 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010410\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010411Return True if all characters in S are whitespace\n\
10412and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010413
10414static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010415unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010416{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 Py_ssize_t i, length;
10418 int kind;
10419 void *data;
10420
10421 if (PyUnicode_READY(self) == -1)
10422 return NULL;
10423 length = PyUnicode_GET_LENGTH(self);
10424 kind = PyUnicode_KIND(self);
10425 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010426
Guido van Rossumd57fd912000-03-10 22:53:23 +000010427 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428 if (length == 1)
10429 return PyBool_FromLong(
10430 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010431
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010432 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010434 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 for (i = 0; i < length; i++) {
10437 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010438 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010439 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010440 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010441 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010442}
10443
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010444PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010445 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010446\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010447Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010448and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010449
10450static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010451unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010452{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 Py_ssize_t i, length;
10454 int kind;
10455 void *data;
10456
10457 if (PyUnicode_READY(self) == -1)
10458 return NULL;
10459 length = PyUnicode_GET_LENGTH(self);
10460 kind = PyUnicode_KIND(self);
10461 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010462
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010463 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 if (length == 1)
10465 return PyBool_FromLong(
10466 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010467
10468 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010470 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 for (i = 0; i < length; i++) {
10473 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010474 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010475 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010476 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010477}
10478
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010479PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010480 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010481\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010482Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010483and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010484
10485static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010486unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010487{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488 int kind;
10489 void *data;
10490 Py_ssize_t len, i;
10491
10492 if (PyUnicode_READY(self) == -1)
10493 return NULL;
10494
10495 kind = PyUnicode_KIND(self);
10496 data = PyUnicode_DATA(self);
10497 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010498
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010499 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 if (len == 1) {
10501 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10502 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10503 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010504
10505 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010507 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010508
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 for (i = 0; i < len; i++) {
10510 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010511 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010512 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010513 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010514 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010515}
10516
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010517PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010518 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010519\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010520Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010521False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522
10523static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010524unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 Py_ssize_t i, length;
10527 int kind;
10528 void *data;
10529
10530 if (PyUnicode_READY(self) == -1)
10531 return NULL;
10532 length = PyUnicode_GET_LENGTH(self);
10533 kind = PyUnicode_KIND(self);
10534 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010535
Guido van Rossumd57fd912000-03-10 22:53:23 +000010536 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 if (length == 1)
10538 return PyBool_FromLong(
10539 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010540
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010541 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010543 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010545 for (i = 0; i < length; i++) {
10546 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010547 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010548 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010549 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010550}
10551
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010552PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010553 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010554\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010555Return True if all characters in S are digits\n\
10556and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010557
10558static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010559unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010560{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 Py_ssize_t i, length;
10562 int kind;
10563 void *data;
10564
10565 if (PyUnicode_READY(self) == -1)
10566 return NULL;
10567 length = PyUnicode_GET_LENGTH(self);
10568 kind = PyUnicode_KIND(self);
10569 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010570
Guido van Rossumd57fd912000-03-10 22:53:23 +000010571 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 if (length == 1) {
10573 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10574 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10575 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010576
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010577 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010579 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 for (i = 0; i < length; i++) {
10582 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010583 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010584 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010585 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586}
10587
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010588PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010589 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010590\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010591Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010592False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593
10594static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010595unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 Py_ssize_t i, length;
10598 int kind;
10599 void *data;
10600
10601 if (PyUnicode_READY(self) == -1)
10602 return NULL;
10603 length = PyUnicode_GET_LENGTH(self);
10604 kind = PyUnicode_KIND(self);
10605 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 if (length == 1)
10609 return PyBool_FromLong(
10610 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010612 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010614 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 for (i = 0; i < length; i++) {
10617 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010618 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010619 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010620 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621}
10622
Martin v. Löwis47383402007-08-15 07:32:56 +000010623int
10624PyUnicode_IsIdentifier(PyObject *self)
10625{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 int kind;
10627 void *data;
10628 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010629 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 if (PyUnicode_READY(self) == -1) {
10632 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010633 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 }
10635
10636 /* Special case for empty strings */
10637 if (PyUnicode_GET_LENGTH(self) == 0)
10638 return 0;
10639 kind = PyUnicode_KIND(self);
10640 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010641
10642 /* PEP 3131 says that the first character must be in
10643 XID_Start and subsequent characters in XID_Continue,
10644 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010645 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010646 letters, digits, underscore). However, given the current
10647 definition of XID_Start and XID_Continue, it is sufficient
10648 to check just for these, except that _ must be allowed
10649 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010650 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010651 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010652 return 0;
10653
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010654 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010656 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010657 return 1;
10658}
10659
10660PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010661 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010662\n\
10663Return True if S is a valid identifier according\n\
10664to the language definition.");
10665
10666static PyObject*
10667unicode_isidentifier(PyObject *self)
10668{
10669 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10670}
10671
Georg Brandl559e5d72008-06-11 18:37:52 +000010672PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010673 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010674\n\
10675Return True if all characters in S are considered\n\
10676printable in repr() or S is empty, False otherwise.");
10677
10678static PyObject*
10679unicode_isprintable(PyObject *self)
10680{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 Py_ssize_t i, length;
10682 int kind;
10683 void *data;
10684
10685 if (PyUnicode_READY(self) == -1)
10686 return NULL;
10687 length = PyUnicode_GET_LENGTH(self);
10688 kind = PyUnicode_KIND(self);
10689 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010690
10691 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 if (length == 1)
10693 return PyBool_FromLong(
10694 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 for (i = 0; i < length; i++) {
10697 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010698 Py_RETURN_FALSE;
10699 }
10700 }
10701 Py_RETURN_TRUE;
10702}
10703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010704PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010705 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706\n\
10707Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010708iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010709
10710static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010711unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010713 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714}
10715
Martin v. Löwis18e16552006-02-15 17:27:45 +000010716static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717unicode_length(PyUnicodeObject *self)
10718{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 if (PyUnicode_READY(self) == -1)
10720 return -1;
10721 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722}
10723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010724PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010725 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010727Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010728done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729
10730static PyObject *
10731unicode_ljust(PyUnicodeObject *self, PyObject *args)
10732{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010733 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 Py_UCS4 fillchar = ' ';
10735
10736 if (PyUnicode_READY(self) == -1)
10737 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010738
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010739 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740 return NULL;
10741
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010742 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743 Py_INCREF(self);
10744 return (PyObject*) self;
10745 }
10746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010747 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748}
10749
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010750PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010751 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010753Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010754
10755static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010756unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010758 return fixup(self, fixlower);
10759}
10760
/* Selectors telling the strip helpers which end(s) of the string to trim */
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010761#define LEFTSTRIP 0
10762#define RIGHTSTRIP 1
10763#define BOTHSTRIP 2
10764
10765/* PyArg_ParseTuple format strings, indexed by the selectors above */
10766static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
10767
/* Method name used in error messages: the format string with its leading
   three characters ("|O:") skipped */
10768#define STRIPNAME(i) (stripformat[i]+3)
10769
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010770/* externally visible for str.strip(unicode) */
10771PyObject *
10772_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10773{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 void *data;
10775 int kind;
10776 Py_ssize_t i, j, len;
10777 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010779 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10780 return NULL;
10781
10782 kind = PyUnicode_KIND(self);
10783 data = PyUnicode_DATA(self);
10784 len = PyUnicode_GET_LENGTH(self);
10785 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10786 PyUnicode_DATA(sepobj),
10787 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010788
Benjamin Peterson14339b62009-01-31 16:36:08 +000010789 i = 0;
10790 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 while (i < len &&
10792 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010793 i++;
10794 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010795 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010796
Benjamin Peterson14339b62009-01-31 16:36:08 +000010797 j = len;
10798 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010799 do {
10800 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 } while (j >= i &&
10802 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010803 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010804 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010805
Victor Stinner12bab6d2011-10-01 01:53:49 +020010806 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807}
10808
10809PyObject*
10810PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10811{
10812 unsigned char *data;
10813 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010814 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815
Victor Stinnerde636f32011-10-01 03:55:54 +020010816 if (PyUnicode_READY(self) == -1)
10817 return NULL;
10818
10819 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10820
Victor Stinner12bab6d2011-10-01 01:53:49 +020010821 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010822 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010823 if (PyUnicode_CheckExact(self)) {
10824 Py_INCREF(self);
10825 return self;
10826 }
10827 else
10828 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010829 }
10830
Victor Stinner12bab6d2011-10-01 01:53:49 +020010831 length = end - start;
10832 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010833 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010834
Victor Stinnerde636f32011-10-01 03:55:54 +020010835 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010836 PyErr_SetString(PyExc_IndexError, "string index out of range");
10837 return NULL;
10838 }
10839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010840 kind = PyUnicode_KIND(self);
10841 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010842 return PyUnicode_FromKindAndData(kind,
10843 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010844 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010845}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846
10847static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010848do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010850 int kind;
10851 void *data;
10852 Py_ssize_t len, i, j;
10853
10854 if (PyUnicode_READY(self) == -1)
10855 return NULL;
10856
10857 kind = PyUnicode_KIND(self);
10858 data = PyUnicode_DATA(self);
10859 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010860
Benjamin Peterson14339b62009-01-31 16:36:08 +000010861 i = 0;
10862 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010863 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010864 i++;
10865 }
10866 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010867
Benjamin Peterson14339b62009-01-31 16:36:08 +000010868 j = len;
10869 if (striptype != LEFTSTRIP) {
10870 do {
10871 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010872 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010873 j++;
10874 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010875
Victor Stinner12bab6d2011-10-01 01:53:49 +020010876 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010877}
10878
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010879
10880static PyObject *
10881do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10882{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010883 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010884
Benjamin Peterson14339b62009-01-31 16:36:08 +000010885 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10886 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010887
Benjamin Peterson14339b62009-01-31 16:36:08 +000010888 if (sep != NULL && sep != Py_None) {
10889 if (PyUnicode_Check(sep))
10890 return _PyUnicode_XStrip(self, striptype, sep);
10891 else {
10892 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010893 "%s arg must be None or str",
10894 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010895 return NULL;
10896 }
10897 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010898
Benjamin Peterson14339b62009-01-31 16:36:08 +000010899 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010900}
10901
10902
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010903PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010904 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010905\n\
10906Return a copy of the string S with leading and trailing\n\
10907whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010908If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010909
10910static PyObject *
10911unicode_strip(PyUnicodeObject *self, PyObject *args)
10912{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010913 if (PyTuple_GET_SIZE(args) == 0)
10914 return do_strip(self, BOTHSTRIP); /* Common case */
10915 else
10916 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010917}
10918
10919
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010920PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010921 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010922\n\
10923Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010924If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010925
10926static PyObject *
10927unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10928{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010929 if (PyTuple_GET_SIZE(args) == 0)
10930 return do_strip(self, LEFTSTRIP); /* Common case */
10931 else
10932 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010933}
10934
10935
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010936PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010937 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010938\n\
10939Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010940If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010941
10942static PyObject *
10943unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10944{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010945 if (PyTuple_GET_SIZE(args) == 0)
10946 return do_strip(self, RIGHTSTRIP); /* Common case */
10947 else
10948 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010949}
10950
10951
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010953unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954{
10955 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010956 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957
Georg Brandl222de0f2009-04-12 12:01:50 +000010958 if (len < 1) {
10959 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020010960 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000010961 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962
Tim Peters7a29bd52001-09-12 03:03:31 +000010963 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964 /* no repeat, return original string */
10965 Py_INCREF(str);
10966 return (PyObject*) str;
10967 }
Tim Peters8f422462000-09-09 06:13:41 +000010968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010969 if (PyUnicode_READY(str) == -1)
10970 return NULL;
10971
Victor Stinnerc759f3e2011-10-01 03:09:58 +020010972 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020010973 PyErr_SetString(PyExc_OverflowError,
10974 "repeated string is too long");
10975 return NULL;
10976 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010977 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010979 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980 if (!u)
10981 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020010982 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010984 if (PyUnicode_GET_LENGTH(str) == 1) {
10985 const int kind = PyUnicode_KIND(str);
10986 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10987 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010988 if (kind == PyUnicode_1BYTE_KIND)
10989 memset(to, (unsigned char)fill_char, len);
10990 else {
10991 for (n = 0; n < len; ++n)
10992 PyUnicode_WRITE(kind, to, n, fill_char);
10993 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 }
10995 else {
10996 /* number of characters copied this far */
10997 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10998 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10999 char *to = (char *) PyUnicode_DATA(u);
11000 Py_MEMCPY(to, PyUnicode_DATA(str),
11001 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011002 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 n = (done <= nchars-done) ? done : nchars-done;
11004 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011005 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011006 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007 }
11008
11009 return (PyObject*) u;
11010}
11011
Alexander Belopolsky40018472011-02-26 01:02:56 +000011012PyObject *
11013PyUnicode_Replace(PyObject *obj,
11014 PyObject *subobj,
11015 PyObject *replobj,
11016 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017{
11018 PyObject *self;
11019 PyObject *str1;
11020 PyObject *str2;
11021 PyObject *result;
11022
11023 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011024 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011025 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011027 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011028 Py_DECREF(self);
11029 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011030 }
11031 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011032 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011033 Py_DECREF(self);
11034 Py_DECREF(str1);
11035 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011037 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011038 Py_DECREF(self);
11039 Py_DECREF(str1);
11040 Py_DECREF(str2);
11041 return result;
11042}
11043
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011044PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011045 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046\n\
11047Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011048old replaced by new. If the optional argument count is\n\
11049given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050
11051static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011052unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011054 PyObject *str1;
11055 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011056 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057 PyObject *result;
11058
Martin v. Löwis18e16552006-02-15 17:27:45 +000011059 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011060 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011061 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011062 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011063 str1 = PyUnicode_FromObject(str1);
11064 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11065 return NULL;
11066 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011067 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011068 Py_DECREF(str1);
11069 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011070 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071
11072 result = replace(self, str1, str2, maxcount);
11073
11074 Py_DECREF(str1);
11075 Py_DECREF(str2);
11076 return result;
11077}
11078
Alexander Belopolsky40018472011-02-26 01:02:56 +000011079static PyObject *
11080unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011081{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011082 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011083 Py_ssize_t isize;
11084 Py_ssize_t osize, squote, dquote, i, o;
11085 Py_UCS4 max, quote;
11086 int ikind, okind;
11087 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011088
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011089 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011090 return NULL;
11091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 isize = PyUnicode_GET_LENGTH(unicode);
11093 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011094
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 /* Compute length of output, quote characters, and
11096 maximum character */
11097 osize = 2; /* quotes */
11098 max = 127;
11099 squote = dquote = 0;
11100 ikind = PyUnicode_KIND(unicode);
11101 for (i = 0; i < isize; i++) {
11102 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11103 switch (ch) {
11104 case '\'': squote++; osize++; break;
11105 case '"': dquote++; osize++; break;
11106 case '\\': case '\t': case '\r': case '\n':
11107 osize += 2; break;
11108 default:
11109 /* Fast-path ASCII */
11110 if (ch < ' ' || ch == 0x7f)
11111 osize += 4; /* \xHH */
11112 else if (ch < 0x7f)
11113 osize++;
11114 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11115 osize++;
11116 max = ch > max ? ch : max;
11117 }
11118 else if (ch < 0x100)
11119 osize += 4; /* \xHH */
11120 else if (ch < 0x10000)
11121 osize += 6; /* \uHHHH */
11122 else
11123 osize += 10; /* \uHHHHHHHH */
11124 }
11125 }
11126
11127 quote = '\'';
11128 if (squote) {
11129 if (dquote)
11130 /* Both squote and dquote present. Use squote,
11131 and escape them */
11132 osize += squote;
11133 else
11134 quote = '"';
11135 }
11136
11137 repr = PyUnicode_New(osize, max);
11138 if (repr == NULL)
11139 return NULL;
11140 okind = PyUnicode_KIND(repr);
11141 odata = PyUnicode_DATA(repr);
11142
11143 PyUnicode_WRITE(okind, odata, 0, quote);
11144 PyUnicode_WRITE(okind, odata, osize-1, quote);
11145
11146 for (i = 0, o = 1; i < isize; i++) {
11147 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011148
11149 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011150 if ((ch == quote) || (ch == '\\')) {
11151 PyUnicode_WRITE(okind, odata, o++, '\\');
11152 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011153 continue;
11154 }
11155
Benjamin Peterson29060642009-01-31 22:14:21 +000011156 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011157 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011158 PyUnicode_WRITE(okind, odata, o++, '\\');
11159 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011160 }
11161 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162 PyUnicode_WRITE(okind, odata, o++, '\\');
11163 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011164 }
11165 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011166 PyUnicode_WRITE(okind, odata, o++, '\\');
11167 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011168 }
11169
11170 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011171 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011172 PyUnicode_WRITE(okind, odata, o++, '\\');
11173 PyUnicode_WRITE(okind, odata, o++, 'x');
11174 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11175 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011176 }
11177
Georg Brandl559e5d72008-06-11 18:37:52 +000011178 /* Copy ASCII characters as-is */
11179 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011181 }
11182
Benjamin Peterson29060642009-01-31 22:14:21 +000011183 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011184 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011185 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011186 (categories Z* and C* except ASCII space)
11187 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011188 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011189 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 if (ch <= 0xff) {
11191 PyUnicode_WRITE(okind, odata, o++, '\\');
11192 PyUnicode_WRITE(okind, odata, o++, 'x');
11193 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11194 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011195 }
11196 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011197 else if (ch >= 0x10000) {
11198 PyUnicode_WRITE(okind, odata, o++, '\\');
11199 PyUnicode_WRITE(okind, odata, o++, 'U');
11200 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11201 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11202 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11203 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11204 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11205 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11206 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11207 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011208 }
11209 /* Map 16-bit characters to '\uxxxx' */
11210 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 PyUnicode_WRITE(okind, odata, o++, '\\');
11212 PyUnicode_WRITE(okind, odata, o++, 'u');
11213 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11214 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11215 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11216 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011217 }
11218 }
11219 /* Copy characters as-is */
11220 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011221 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011222 }
11223 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011224 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011225 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011226 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227}
11228
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011229PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011230 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231\n\
11232Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011233such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234arguments start and end are interpreted as in slice notation.\n\
11235\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011236Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237
11238static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011239unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240{
Jesus Ceaac451502011-04-20 17:09:23 +020011241 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011242 Py_ssize_t start;
11243 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011244 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245
Jesus Ceaac451502011-04-20 17:09:23 +020011246 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11247 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011248 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011250 if (PyUnicode_READY(self) == -1)
11251 return NULL;
11252 if (PyUnicode_READY(substring) == -1)
11253 return NULL;
11254
11255 result = any_find_slice(
11256 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11257 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011258 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259
11260 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011261
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011262 if (result == -2)
11263 return NULL;
11264
Christian Heimes217cfd12007-12-02 14:31:20 +000011265 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266}
11267
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011268PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011269 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011271Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272
11273static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011274unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275{
Jesus Ceaac451502011-04-20 17:09:23 +020011276 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011277 Py_ssize_t start;
11278 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011279 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280
Jesus Ceaac451502011-04-20 17:09:23 +020011281 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11282 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011285 if (PyUnicode_READY(self) == -1)
11286 return NULL;
11287 if (PyUnicode_READY(substring) == -1)
11288 return NULL;
11289
11290 result = any_find_slice(
11291 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11292 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011293 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294
11295 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011297 if (result == -2)
11298 return NULL;
11299
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300 if (result < 0) {
11301 PyErr_SetString(PyExc_ValueError, "substring not found");
11302 return NULL;
11303 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011304
Christian Heimes217cfd12007-12-02 14:31:20 +000011305 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306}
11307
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011308PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011309 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011310\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011311Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011312done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313
11314static PyObject *
11315unicode_rjust(PyUnicodeObject *self, PyObject *args)
11316{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011317 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011318 Py_UCS4 fillchar = ' ';
11319
Victor Stinnere9a29352011-10-01 02:14:59 +020011320 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011321 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011322
Victor Stinnere9a29352011-10-01 02:14:59 +020011323 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324 return NULL;
11325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327 Py_INCREF(self);
11328 return (PyObject*) self;
11329 }
11330
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332}
11333
Alexander Belopolsky40018472011-02-26 01:02:56 +000011334PyObject *
11335PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336{
11337 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011338
Guido van Rossumd57fd912000-03-10 22:53:23 +000011339 s = PyUnicode_FromObject(s);
11340 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011341 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011342 if (sep != NULL) {
11343 sep = PyUnicode_FromObject(sep);
11344 if (sep == NULL) {
11345 Py_DECREF(s);
11346 return NULL;
11347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348 }
11349
11350 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11351
11352 Py_DECREF(s);
11353 Py_XDECREF(sep);
11354 return result;
11355}
11356
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011357PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011358 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359\n\
11360Return a list of the words in S, using sep as the\n\
11361delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011362splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011363whitespace string is a separator and empty strings are\n\
11364removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011365
11366static PyObject*
11367unicode_split(PyUnicodeObject *self, PyObject *args)
11368{
11369 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011370 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371
Martin v. Löwis18e16552006-02-15 17:27:45 +000011372 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011373 return NULL;
11374
11375 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011376 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011377 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011378 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011379 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011380 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381}
11382
Thomas Wouters477c8d52006-05-27 19:21:47 +000011383PyObject *
11384PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11385{
11386 PyObject* str_obj;
11387 PyObject* sep_obj;
11388 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011389 int kind1, kind2, kind;
11390 void *buf1 = NULL, *buf2 = NULL;
11391 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011392
11393 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011394 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011395 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011396 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011398 Py_DECREF(str_obj);
11399 return NULL;
11400 }
11401
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011402 kind1 = PyUnicode_KIND(str_in);
11403 kind2 = PyUnicode_KIND(sep_obj);
11404 kind = kind1 > kind2 ? kind1 : kind2;
11405 buf1 = PyUnicode_DATA(str_in);
11406 if (kind1 != kind)
11407 buf1 = _PyUnicode_AsKind(str_in, kind);
11408 if (!buf1)
11409 goto onError;
11410 buf2 = PyUnicode_DATA(sep_obj);
11411 if (kind2 != kind)
11412 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11413 if (!buf2)
11414 goto onError;
11415 len1 = PyUnicode_GET_LENGTH(str_obj);
11416 len2 = PyUnicode_GET_LENGTH(sep_obj);
11417
11418 switch(PyUnicode_KIND(str_in)) {
11419 case PyUnicode_1BYTE_KIND:
11420 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11421 break;
11422 case PyUnicode_2BYTE_KIND:
11423 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11424 break;
11425 case PyUnicode_4BYTE_KIND:
11426 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11427 break;
11428 default:
11429 assert(0);
11430 out = 0;
11431 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011432
11433 Py_DECREF(sep_obj);
11434 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011435 if (kind1 != kind)
11436 PyMem_Free(buf1);
11437 if (kind2 != kind)
11438 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011439
11440 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011441 onError:
11442 Py_DECREF(sep_obj);
11443 Py_DECREF(str_obj);
11444 if (kind1 != kind && buf1)
11445 PyMem_Free(buf1);
11446 if (kind2 != kind && buf2)
11447 PyMem_Free(buf2);
11448 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011449}
11450
11451
11452PyObject *
11453PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11454{
11455 PyObject* str_obj;
11456 PyObject* sep_obj;
11457 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 int kind1, kind2, kind;
11459 void *buf1 = NULL, *buf2 = NULL;
11460 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011461
11462 str_obj = PyUnicode_FromObject(str_in);
11463 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011464 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011465 sep_obj = PyUnicode_FromObject(sep_in);
11466 if (!sep_obj) {
11467 Py_DECREF(str_obj);
11468 return NULL;
11469 }
11470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471 kind1 = PyUnicode_KIND(str_in);
11472 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011473 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011474 buf1 = PyUnicode_DATA(str_in);
11475 if (kind1 != kind)
11476 buf1 = _PyUnicode_AsKind(str_in, kind);
11477 if (!buf1)
11478 goto onError;
11479 buf2 = PyUnicode_DATA(sep_obj);
11480 if (kind2 != kind)
11481 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11482 if (!buf2)
11483 goto onError;
11484 len1 = PyUnicode_GET_LENGTH(str_obj);
11485 len2 = PyUnicode_GET_LENGTH(sep_obj);
11486
11487 switch(PyUnicode_KIND(str_in)) {
11488 case PyUnicode_1BYTE_KIND:
11489 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11490 break;
11491 case PyUnicode_2BYTE_KIND:
11492 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11493 break;
11494 case PyUnicode_4BYTE_KIND:
11495 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11496 break;
11497 default:
11498 assert(0);
11499 out = 0;
11500 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011501
11502 Py_DECREF(sep_obj);
11503 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 if (kind1 != kind)
11505 PyMem_Free(buf1);
11506 if (kind2 != kind)
11507 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011508
11509 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011510 onError:
11511 Py_DECREF(sep_obj);
11512 Py_DECREF(str_obj);
11513 if (kind1 != kind && buf1)
11514 PyMem_Free(buf1);
11515 if (kind2 != kind && buf2)
11516 PyMem_Free(buf2);
11517 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011518}
11519
11520PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011521 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011522\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011523Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011524the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011525found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011526
11527static PyObject*
11528unicode_partition(PyUnicodeObject *self, PyObject *separator)
11529{
11530 return PyUnicode_Partition((PyObject *)self, separator);
11531}
11532
11533PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011534 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011535\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011536Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011537the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011538separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011539
11540static PyObject*
11541unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11542{
11543 return PyUnicode_RPartition((PyObject *)self, separator);
11544}
11545
Alexander Belopolsky40018472011-02-26 01:02:56 +000011546PyObject *
11547PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011548{
11549 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011550
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011551 s = PyUnicode_FromObject(s);
11552 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011553 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011554 if (sep != NULL) {
11555 sep = PyUnicode_FromObject(sep);
11556 if (sep == NULL) {
11557 Py_DECREF(s);
11558 return NULL;
11559 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011560 }
11561
11562 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11563
11564 Py_DECREF(s);
11565 Py_XDECREF(sep);
11566 return result;
11567}
11568
11569PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011570 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011571\n\
11572Return a list of the words in S, using sep as the\n\
11573delimiter string, starting at the end of the string and\n\
11574working to the front. If maxsplit is given, at most maxsplit\n\
11575splits are done. If sep is not specified, any whitespace string\n\
11576is a separator.");
11577
11578static PyObject*
11579unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11580{
11581 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011582 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011583
Martin v. Löwis18e16552006-02-15 17:27:45 +000011584 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011585 return NULL;
11586
11587 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011588 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011589 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011590 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011591 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011592 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011593}
11594
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011595PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011596 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597\n\
11598Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011599Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011600is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601
11602static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011603unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011605 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011606 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011608 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11609 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610 return NULL;
11611
Guido van Rossum86662912000-04-11 15:38:46 +000011612 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613}
11614
11615static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011616PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617{
Walter Dörwald346737f2007-05-31 10:44:43 +000011618 if (PyUnicode_CheckExact(self)) {
11619 Py_INCREF(self);
11620 return self;
11621 } else
11622 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011623 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624}
11625
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011626PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011627 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628\n\
11629Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011630and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631
11632static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011633unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635 return fixup(self, fixswapcase);
11636}
11637
Georg Brandlceee0772007-11-27 23:48:05 +000011638PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011639 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011640\n\
11641Return a translation table usable for str.translate().\n\
11642If there is only one argument, it must be a dictionary mapping Unicode\n\
11643ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011644Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011645If there are two arguments, they must be strings of equal length, and\n\
11646in the resulting dictionary, each character in x will be mapped to the\n\
11647character at the same position in y. If there is a third argument, it\n\
11648must be a string, whose characters will be mapped to None in the result.");
11649
11650static PyObject*
11651unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11652{
11653 PyObject *x, *y = NULL, *z = NULL;
11654 PyObject *new = NULL, *key, *value;
11655 Py_ssize_t i = 0;
11656 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011657
Georg Brandlceee0772007-11-27 23:48:05 +000011658 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11659 return NULL;
11660 new = PyDict_New();
11661 if (!new)
11662 return NULL;
11663 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011664 int x_kind, y_kind, z_kind;
11665 void *x_data, *y_data, *z_data;
11666
Georg Brandlceee0772007-11-27 23:48:05 +000011667 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011668 if (!PyUnicode_Check(x)) {
11669 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11670 "be a string if there is a second argument");
11671 goto err;
11672 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011673 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011674 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11675 "arguments must have equal length");
11676 goto err;
11677 }
11678 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679 x_kind = PyUnicode_KIND(x);
11680 y_kind = PyUnicode_KIND(y);
11681 x_data = PyUnicode_DATA(x);
11682 y_data = PyUnicode_DATA(y);
11683 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11684 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11685 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011686 if (!key || !value)
11687 goto err;
11688 res = PyDict_SetItem(new, key, value);
11689 Py_DECREF(key);
11690 Py_DECREF(value);
11691 if (res < 0)
11692 goto err;
11693 }
11694 /* create entries for deleting chars in z */
11695 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 z_kind = PyUnicode_KIND(z);
11697 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011698 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011700 if (!key)
11701 goto err;
11702 res = PyDict_SetItem(new, key, Py_None);
11703 Py_DECREF(key);
11704 if (res < 0)
11705 goto err;
11706 }
11707 }
11708 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 int kind;
11710 void *data;
11711
Georg Brandlceee0772007-11-27 23:48:05 +000011712 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011713 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011714 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11715 "to maketrans it must be a dict");
11716 goto err;
11717 }
11718 /* copy entries into the new dict, converting string keys to int keys */
11719 while (PyDict_Next(x, &i, &key, &value)) {
11720 if (PyUnicode_Check(key)) {
11721 /* convert string keys to integer keys */
11722 PyObject *newkey;
11723 if (PyUnicode_GET_SIZE(key) != 1) {
11724 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11725 "table must be of length 1");
11726 goto err;
11727 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728 kind = PyUnicode_KIND(key);
11729 data = PyUnicode_DATA(key);
11730 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011731 if (!newkey)
11732 goto err;
11733 res = PyDict_SetItem(new, newkey, value);
11734 Py_DECREF(newkey);
11735 if (res < 0)
11736 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011737 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011738 /* just keep integer keys */
11739 if (PyDict_SetItem(new, key, value) < 0)
11740 goto err;
11741 } else {
11742 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11743 "be strings or integers");
11744 goto err;
11745 }
11746 }
11747 }
11748 return new;
11749 err:
11750 Py_DECREF(new);
11751 return NULL;
11752}
11753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011754PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011755 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756\n\
11757Return a copy of the string S, where all characters have been mapped\n\
11758through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011759Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011760Unmapped characters are left untouched. Characters mapped to None\n\
11761are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762
11763static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767}
11768
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011769PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011770 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011772Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773
11774static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011775unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777 return fixup(self, fixupper);
11778}
11779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011780PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011781 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011783Pad a numeric string S with zeros on the left, to fill a field\n\
11784of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785
11786static PyObject *
11787unicode_zfill(PyUnicodeObject *self, PyObject *args)
11788{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011789 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011791 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 int kind;
11793 void *data;
11794 Py_UCS4 chr;
11795
11796 if (PyUnicode_READY(self) == -1)
11797 return NULL;
11798
Martin v. Löwis18e16552006-02-15 17:27:45 +000011799 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800 return NULL;
11801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011803 if (PyUnicode_CheckExact(self)) {
11804 Py_INCREF(self);
11805 return (PyObject*) self;
11806 }
11807 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011808 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809 }
11810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011811 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812
11813 u = pad(self, fill, 0, '0');
11814
Walter Dörwald068325e2002-04-15 13:36:47 +000011815 if (u == NULL)
11816 return NULL;
11817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 kind = PyUnicode_KIND(u);
11819 data = PyUnicode_DATA(u);
11820 chr = PyUnicode_READ(kind, data, fill);
11821
11822 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 PyUnicode_WRITE(kind, data, 0, chr);
11825 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826 }
11827
11828 return (PyObject*) u;
11829}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
11838
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011839PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011840 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011842Return True if S starts with the specified prefix, False otherwise.\n\
11843With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011844With optional end, stop comparing S at that position.\n\
11845prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846
11847static PyObject *
11848unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011849 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011851 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011853 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011854 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011855 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011856
Jesus Ceaac451502011-04-20 17:09:23 +020011857 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011858 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011859 if (PyTuple_Check(subobj)) {
11860 Py_ssize_t i;
11861 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11862 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011863 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011864 if (substring == NULL)
11865 return NULL;
11866 result = tailmatch(self, substring, start, end, -1);
11867 Py_DECREF(substring);
11868 if (result) {
11869 Py_RETURN_TRUE;
11870 }
11871 }
11872 /* nothing matched */
11873 Py_RETURN_FALSE;
11874 }
11875 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011876 if (substring == NULL) {
11877 if (PyErr_ExceptionMatches(PyExc_TypeError))
11878 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11879 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011880 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011881 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011882 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011884 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885}
11886
11887
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011888PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011889 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011891Return True if S ends with the specified suffix, False otherwise.\n\
11892With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011893With optional end, stop comparing S at that position.\n\
11894suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895
11896static PyObject *
11897unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011898 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011900 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011902 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011903 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011904 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905
Jesus Ceaac451502011-04-20 17:09:23 +020011906 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011907 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011908 if (PyTuple_Check(subobj)) {
11909 Py_ssize_t i;
11910 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11911 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011912 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011913 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011914 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011915 result = tailmatch(self, substring, start, end, +1);
11916 Py_DECREF(substring);
11917 if (result) {
11918 Py_RETURN_TRUE;
11919 }
11920 }
11921 Py_RETURN_FALSE;
11922 }
11923 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011924 if (substring == NULL) {
11925 if (PyErr_ExceptionMatches(PyExc_TypeError))
11926 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11927 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011928 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011929 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011930 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011932 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933}
11934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011936
11937PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011938 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011939\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011940Return a formatted version of S, using substitutions from args and kwargs.\n\
11941The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011942
Eric Smith27bbca62010-11-04 17:06:58 +000011943PyDoc_STRVAR(format_map__doc__,
11944 "S.format_map(mapping) -> str\n\
11945\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011946Return a formatted version of S, using substitutions from mapping.\n\
11947The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011948
Eric Smith4a7d76d2008-05-30 18:10:19 +000011949static PyObject *
11950unicode__format__(PyObject* self, PyObject* args)
11951{
11952 PyObject *format_spec;
11953
11954 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11955 return NULL;
11956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11958 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011959}
11960
Eric Smith8c663262007-08-25 02:26:07 +000011961PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011962 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011963\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011964Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011965
11966static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011967unicode__sizeof__(PyUnicodeObject *v)
11968{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 Py_ssize_t size;
11970
11971 /* If it's a compact object, account for base structure +
11972 character data. */
11973 if (PyUnicode_IS_COMPACT_ASCII(v))
11974 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11975 else if (PyUnicode_IS_COMPACT(v))
11976 size = sizeof(PyCompactUnicodeObject) +
11977 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11978 else {
11979 /* If it is a two-block object, account for base object, and
11980 for character block if present. */
11981 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020011982 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 size += (PyUnicode_GET_LENGTH(v) + 1) *
11984 PyUnicode_CHARACTER_SIZE(v);
11985 }
11986 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020011987 with the data pointer. Check if the data is not shared. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 if (_PyUnicode_WSTR(v) &&
Victor Stinnera3be6132011-10-03 02:16:37 +020011989 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020011991 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020011992 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993
11994 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011995}
11996
11997PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011998 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011999
12000static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012001unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012002{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012003 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 if (!copy)
12005 return NULL;
12006 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012007}
12008
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009static PyMethodDef unicode_methods[] = {
12010
12011 /* Order is according to common usage: often used methods should
12012 appear first, since lookup is done sequentially. */
12013
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012014 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012015 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12016 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012017 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012018 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12019 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12020 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12021 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12022 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12023 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12024 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012025 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012026 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12027 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12028 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012029 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012030 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12031 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12032 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012033 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012034 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012035 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012036 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012037 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12038 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12039 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12040 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12041 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12042 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12043 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12044 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12045 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12046 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12047 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12048 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12049 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12050 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012051 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012052 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012053 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012054 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012055 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012056 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012057 {"maketrans", (PyCFunction) unicode_maketrans,
12058 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012059 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012060#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012061 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062#endif
12063
12064#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012065 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012066 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067#endif
12068
Benjamin Peterson14339b62009-01-31 16:36:08 +000012069 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070 {NULL, NULL}
12071};
12072
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012073static PyObject *
12074unicode_mod(PyObject *v, PyObject *w)
12075{
Brian Curtindfc80e32011-08-10 20:28:54 -050012076 if (!PyUnicode_Check(v))
12077 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012078 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012079}
12080
12081static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012082 0, /*nb_add*/
12083 0, /*nb_subtract*/
12084 0, /*nb_multiply*/
12085 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012086};
12087
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012089 (lenfunc) unicode_length, /* sq_length */
12090 PyUnicode_Concat, /* sq_concat */
12091 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12092 (ssizeargfunc) unicode_getitem, /* sq_item */
12093 0, /* sq_slice */
12094 0, /* sq_ass_item */
12095 0, /* sq_ass_slice */
12096 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097};
12098
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012099static PyObject*
12100unicode_subscript(PyUnicodeObject* self, PyObject* item)
12101{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 if (PyUnicode_READY(self) == -1)
12103 return NULL;
12104
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012105 if (PyIndex_Check(item)) {
12106 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012107 if (i == -1 && PyErr_Occurred())
12108 return NULL;
12109 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012111 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012112 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012113 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012115 Py_UNICODE* result_buf;
12116 PyObject* result;
12117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012118 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012119 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012120 return NULL;
12121 }
12122
12123 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124 return PyUnicode_New(0, 0);
12125 } else if (start == 0 && step == 1 &&
12126 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012127 PyUnicode_CheckExact(self)) {
12128 Py_INCREF(self);
12129 return (PyObject *)self;
12130 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012131 return PyUnicode_Substring((PyObject*)self,
12132 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012133 } else {
12134 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000012135 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
12136 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012137
Benjamin Peterson29060642009-01-31 22:14:21 +000012138 if (result_buf == NULL)
12139 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012140
12141 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12142 result_buf[i] = source_buf[cur];
12143 }
Tim Petersced69f82003-09-16 20:30:58 +000012144
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012145 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000012146 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012147 return result;
12148 }
12149 } else {
12150 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12151 return NULL;
12152 }
12153}
12154
12155static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012156 (lenfunc)unicode_length, /* mp_length */
12157 (binaryfunc)unicode_subscript, /* mp_subscript */
12158 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012159};
12160
Guido van Rossumd57fd912000-03-10 22:53:23 +000012161
Guido van Rossumd57fd912000-03-10 22:53:23 +000012162/* Helpers for PyUnicode_Format() */
12163
12164static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012165getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012166{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012167 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012168 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012169 (*p_argidx)++;
12170 if (arglen < 0)
12171 return args;
12172 else
12173 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174 }
12175 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012176 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177 return NULL;
12178}
12179
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012180/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012182static PyObject *
12183formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012185 char *p;
12186 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012188
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189 x = PyFloat_AsDouble(v);
12190 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012191 return NULL;
12192
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012194 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012195
Eric Smith0923d1d2009-04-16 20:16:10 +000012196 p = PyOS_double_to_string(x, type, prec,
12197 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012198 if (p == NULL)
12199 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012200 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012201 PyMem_Free(p);
12202 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203}
12204
Tim Peters38fd5b62000-09-21 05:43:11 +000012205static PyObject*
12206formatlong(PyObject *val, int flags, int prec, int type)
12207{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012208 char *buf;
12209 int len;
12210 PyObject *str; /* temporary string object. */
12211 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012212
Benjamin Peterson14339b62009-01-31 16:36:08 +000012213 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12214 if (!str)
12215 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012216 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012217 Py_DECREF(str);
12218 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012219}
12220
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012222formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012223 size_t buflen,
12224 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012225{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012226 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012227 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012228 if (PyUnicode_GET_LENGTH(v) == 1) {
12229 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012230 buf[1] = '\0';
12231 return 1;
12232 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012233 goto onError;
12234 }
12235 else {
12236 /* Integer input truncated to a character */
12237 long x;
12238 x = PyLong_AsLong(v);
12239 if (x == -1 && PyErr_Occurred())
12240 goto onError;
12241
12242 if (x < 0 || x > 0x10ffff) {
12243 PyErr_SetString(PyExc_OverflowError,
12244 "%c arg not in range(0x110000)");
12245 return -1;
12246 }
12247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012248 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012249 buf[1] = '\0';
12250 return 1;
12251 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012252
Benjamin Peterson29060642009-01-31 22:14:21 +000012253 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012254 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012255 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012256 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257}
12258
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in elements) of the buffer in which single
   characters are formatted.  The expansion is fully parenthesized so that
   the macro behaves like one primary expression wherever it is used.
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012263
Alexander Belopolsky40018472011-02-26 01:02:56 +000012264PyObject *
12265PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012267 void *fmt;
12268 int fmtkind;
12269 PyObject *result;
12270 Py_UCS4 *res, *res0;
12271 Py_UCS4 max;
12272 int kind;
12273 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012276 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012277
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012279 PyErr_BadInternalCall();
12280 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12283 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012284 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012285 fmt = PyUnicode_DATA(uformat);
12286 fmtkind = PyUnicode_KIND(uformat);
12287 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12288 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289
12290 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12292 if (res0 == NULL) {
12293 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012294 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012295 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296
12297 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012298 arglen = PyTuple_Size(args);
12299 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012300 }
12301 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012302 arglen = -1;
12303 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012305 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012306 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012307 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308
12309 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012311 if (--rescnt < 0) {
12312 rescnt = fmtcnt + 100;
12313 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012314 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12315 if (res0 == NULL){
12316 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012317 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318 }
12319 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012320 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012321 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012323 }
12324 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012325 /* Got a format specifier */
12326 int flags = 0;
12327 Py_ssize_t width = -1;
12328 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012329 Py_UCS4 c = '\0';
12330 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012331 int isnumok;
12332 PyObject *v = NULL;
12333 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334 void *pbuf;
12335 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012336 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012337 Py_ssize_t len, len1;
12338 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 fmtpos++;
12341 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12342 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012343 Py_ssize_t keylen;
12344 PyObject *key;
12345 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012346
Benjamin Peterson29060642009-01-31 22:14:21 +000012347 if (dict == NULL) {
12348 PyErr_SetString(PyExc_TypeError,
12349 "format requires a mapping");
12350 goto onError;
12351 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012353 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012355 /* Skip over balanced parentheses */
12356 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012358 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012360 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012362 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012364 if (fmtcnt < 0 || pcount > 0) {
12365 PyErr_SetString(PyExc_ValueError,
12366 "incomplete format key");
12367 goto onError;
12368 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012369 key = PyUnicode_Substring((PyObject*)uformat,
12370 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012371 if (key == NULL)
12372 goto onError;
12373 if (args_owned) {
12374 Py_DECREF(args);
12375 args_owned = 0;
12376 }
12377 args = PyObject_GetItem(dict, key);
12378 Py_DECREF(key);
12379 if (args == NULL) {
12380 goto onError;
12381 }
12382 args_owned = 1;
12383 arglen = -1;
12384 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012385 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012386 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012388 case '-': flags |= F_LJUST; continue;
12389 case '+': flags |= F_SIGN; continue;
12390 case ' ': flags |= F_BLANK; continue;
12391 case '#': flags |= F_ALT; continue;
12392 case '0': flags |= F_ZERO; continue;
12393 }
12394 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012395 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012396 if (c == '*') {
12397 v = getnextarg(args, arglen, &argidx);
12398 if (v == NULL)
12399 goto onError;
12400 if (!PyLong_Check(v)) {
12401 PyErr_SetString(PyExc_TypeError,
12402 "* wants int");
12403 goto onError;
12404 }
12405 width = PyLong_AsLong(v);
12406 if (width == -1 && PyErr_Occurred())
12407 goto onError;
12408 if (width < 0) {
12409 flags |= F_LJUST;
12410 width = -width;
12411 }
12412 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012413 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012414 }
12415 else if (c >= '0' && c <= '9') {
12416 width = c - '0';
12417 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012418 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012419 if (c < '0' || c > '9')
12420 break;
12421 if ((width*10) / 10 != width) {
12422 PyErr_SetString(PyExc_ValueError,
12423 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012424 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012425 }
12426 width = width*10 + (c - '0');
12427 }
12428 }
12429 if (c == '.') {
12430 prec = 0;
12431 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012432 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012433 if (c == '*') {
12434 v = getnextarg(args, arglen, &argidx);
12435 if (v == NULL)
12436 goto onError;
12437 if (!PyLong_Check(v)) {
12438 PyErr_SetString(PyExc_TypeError,
12439 "* wants int");
12440 goto onError;
12441 }
12442 prec = PyLong_AsLong(v);
12443 if (prec == -1 && PyErr_Occurred())
12444 goto onError;
12445 if (prec < 0)
12446 prec = 0;
12447 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012448 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012449 }
12450 else if (c >= '0' && c <= '9') {
12451 prec = c - '0';
12452 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012453 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012454 if (c < '0' || c > '9')
12455 break;
12456 if ((prec*10) / 10 != prec) {
12457 PyErr_SetString(PyExc_ValueError,
12458 "prec too big");
12459 goto onError;
12460 }
12461 prec = prec*10 + (c - '0');
12462 }
12463 }
12464 } /* prec */
12465 if (fmtcnt >= 0) {
12466 if (c == 'h' || c == 'l' || c == 'L') {
12467 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012468 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012469 }
12470 }
12471 if (fmtcnt < 0) {
12472 PyErr_SetString(PyExc_ValueError,
12473 "incomplete format");
12474 goto onError;
12475 }
12476 if (c != '%') {
12477 v = getnextarg(args, arglen, &argidx);
12478 if (v == NULL)
12479 goto onError;
12480 }
12481 sign = 0;
12482 fill = ' ';
12483 switch (c) {
12484
12485 case '%':
12486 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012487 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012488 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012489 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012490 len = 1;
12491 break;
12492
12493 case 's':
12494 case 'r':
12495 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012496 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012497 temp = v;
12498 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012499 }
12500 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012501 if (c == 's')
12502 temp = PyObject_Str(v);
12503 else if (c == 'r')
12504 temp = PyObject_Repr(v);
12505 else
12506 temp = PyObject_ASCII(v);
12507 if (temp == NULL)
12508 goto onError;
12509 if (PyUnicode_Check(temp))
12510 /* nothing to do */;
12511 else {
12512 Py_DECREF(temp);
12513 PyErr_SetString(PyExc_TypeError,
12514 "%s argument has non-string str()");
12515 goto onError;
12516 }
12517 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518 if (PyUnicode_READY(temp) == -1) {
12519 Py_CLEAR(temp);
12520 goto onError;
12521 }
12522 pbuf = PyUnicode_DATA(temp);
12523 kind = PyUnicode_KIND(temp);
12524 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012525 if (prec >= 0 && len > prec)
12526 len = prec;
12527 break;
12528
12529 case 'i':
12530 case 'd':
12531 case 'u':
12532 case 'o':
12533 case 'x':
12534 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012535 isnumok = 0;
12536 if (PyNumber_Check(v)) {
12537 PyObject *iobj=NULL;
12538
12539 if (PyLong_Check(v)) {
12540 iobj = v;
12541 Py_INCREF(iobj);
12542 }
12543 else {
12544 iobj = PyNumber_Long(v);
12545 }
12546 if (iobj!=NULL) {
12547 if (PyLong_Check(iobj)) {
12548 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012549 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012550 Py_DECREF(iobj);
12551 if (!temp)
12552 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012553 if (PyUnicode_READY(temp) == -1) {
12554 Py_CLEAR(temp);
12555 goto onError;
12556 }
12557 pbuf = PyUnicode_DATA(temp);
12558 kind = PyUnicode_KIND(temp);
12559 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012560 sign = 1;
12561 }
12562 else {
12563 Py_DECREF(iobj);
12564 }
12565 }
12566 }
12567 if (!isnumok) {
12568 PyErr_Format(PyExc_TypeError,
12569 "%%%c format: a number is required, "
12570 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12571 goto onError;
12572 }
12573 if (flags & F_ZERO)
12574 fill = '0';
12575 break;
12576
12577 case 'e':
12578 case 'E':
12579 case 'f':
12580 case 'F':
12581 case 'g':
12582 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012583 temp = formatfloat(v, flags, prec, c);
12584 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012585 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 if (PyUnicode_READY(temp) == -1) {
12587 Py_CLEAR(temp);
12588 goto onError;
12589 }
12590 pbuf = PyUnicode_DATA(temp);
12591 kind = PyUnicode_KIND(temp);
12592 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012593 sign = 1;
12594 if (flags & F_ZERO)
12595 fill = '0';
12596 break;
12597
12598 case 'c':
12599 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012600 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012601 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012602 if (len < 0)
12603 goto onError;
12604 break;
12605
12606 default:
12607 PyErr_Format(PyExc_ValueError,
12608 "unsupported format character '%c' (0x%x) "
12609 "at index %zd",
12610 (31<=c && c<=126) ? (char)c : '?',
12611 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012613 goto onError;
12614 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012615 /* pbuf is initialized here. */
12616 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012617 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012618 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12619 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12620 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012621 len--;
12622 }
12623 else if (flags & F_SIGN)
12624 sign = '+';
12625 else if (flags & F_BLANK)
12626 sign = ' ';
12627 else
12628 sign = 0;
12629 }
12630 if (width < len)
12631 width = len;
12632 if (rescnt - (sign != 0) < width) {
12633 reslen -= rescnt;
12634 rescnt = width + fmtcnt + 100;
12635 reslen += rescnt;
12636 if (reslen < 0) {
12637 Py_XDECREF(temp);
12638 PyErr_NoMemory();
12639 goto onError;
12640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12642 if (res0 == 0) {
12643 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012644 Py_XDECREF(temp);
12645 goto onError;
12646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012647 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012648 }
12649 if (sign) {
12650 if (fill != ' ')
12651 *res++ = sign;
12652 rescnt--;
12653 if (width > len)
12654 width--;
12655 }
12656 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12658 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012659 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012660 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12661 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012662 }
12663 rescnt -= 2;
12664 width -= 2;
12665 if (width < 0)
12666 width = 0;
12667 len -= 2;
12668 }
12669 if (width > len && !(flags & F_LJUST)) {
12670 do {
12671 --rescnt;
12672 *res++ = fill;
12673 } while (--width > len);
12674 }
12675 if (fill == ' ') {
12676 if (sign)
12677 *res++ = sign;
12678 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012679 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12680 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12681 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12682 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012683 }
12684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012685 /* Copy all characters, preserving len */
12686 len1 = len;
12687 while (len1--) {
12688 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12689 rescnt--;
12690 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012691 while (--width >= len) {
12692 --rescnt;
12693 *res++ = ' ';
12694 }
12695 if (dict && (argidx < arglen) && c != '%') {
12696 PyErr_SetString(PyExc_TypeError,
12697 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012698 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012699 goto onError;
12700 }
12701 Py_XDECREF(temp);
12702 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012703 } /* until end */
12704 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012705 PyErr_SetString(PyExc_TypeError,
12706 "not all arguments converted during string formatting");
12707 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012708 }
12709
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012710
12711 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12712 if (*res > max)
12713 max = *res;
12714 result = PyUnicode_New(reslen - rescnt, max);
12715 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012716 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717 kind = PyUnicode_KIND(result);
12718 for (res = res0; res < res0+reslen-rescnt; res++)
12719 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12720 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012721 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012722 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012723 }
12724 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012725 return (PyObject *)result;
12726
Benjamin Peterson29060642009-01-31 22:14:21 +000012727 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012728 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012729 Py_DECREF(uformat);
12730 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012731 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012732 }
12733 return NULL;
12734}
12735
Jeremy Hylton938ace62002-07-17 16:30:39 +000012736static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012737unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12738
Tim Peters6d6c1a32001-08-02 04:15:00 +000012739static PyObject *
12740unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12741{
Benjamin Peterson29060642009-01-31 22:14:21 +000012742 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012743 static char *kwlist[] = {"object", "encoding", "errors", 0};
12744 char *encoding = NULL;
12745 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012746
Benjamin Peterson14339b62009-01-31 16:36:08 +000012747 if (type != &PyUnicode_Type)
12748 return unicode_subtype_new(type, args, kwds);
12749 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012750 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012751 return NULL;
12752 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012753 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012754 if (encoding == NULL && errors == NULL)
12755 return PyObject_Str(x);
12756 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012757 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012758}
12759
Guido van Rossume023fe02001-08-30 03:12:59 +000012760static PyObject *
12761unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12762{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012763 PyUnicodeObject *unicode, *self;
12764 Py_ssize_t length, char_size;
12765 int share_wstr, share_utf8;
12766 unsigned int kind;
12767 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012768
Benjamin Peterson14339b62009-01-31 16:36:08 +000012769 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012770
12771 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12772 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012773 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012774 assert(_PyUnicode_CHECK(unicode));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012775 if (PyUnicode_READY(unicode))
12776 return NULL;
12777
12778 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12779 if (self == NULL) {
12780 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012781 return NULL;
12782 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012783 kind = PyUnicode_KIND(unicode);
12784 length = PyUnicode_GET_LENGTH(unicode);
12785
12786 _PyUnicode_LENGTH(self) = length;
12787 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12788 _PyUnicode_STATE(self).interned = 0;
12789 _PyUnicode_STATE(self).kind = kind;
12790 _PyUnicode_STATE(self).compact = 0;
12791 _PyUnicode_STATE(self).ascii = 0;
12792 _PyUnicode_STATE(self).ready = 1;
12793 _PyUnicode_WSTR(self) = NULL;
12794 _PyUnicode_UTF8_LENGTH(self) = 0;
12795 _PyUnicode_UTF8(self) = NULL;
12796 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012797 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012798
12799 share_utf8 = 0;
12800 share_wstr = 0;
12801 if (kind == PyUnicode_1BYTE_KIND) {
12802 char_size = 1;
12803 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12804 share_utf8 = 1;
12805 }
12806 else if (kind == PyUnicode_2BYTE_KIND) {
12807 char_size = 2;
12808 if (sizeof(wchar_t) == 2)
12809 share_wstr = 1;
12810 }
12811 else {
12812 assert(kind == PyUnicode_4BYTE_KIND);
12813 char_size = 4;
12814 if (sizeof(wchar_t) == 4)
12815 share_wstr = 1;
12816 }
12817
12818 /* Ensure we won't overflow the length. */
12819 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12820 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012822 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012823 data = PyObject_MALLOC((length + 1) * char_size);
12824 if (data == NULL) {
12825 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012826 goto onError;
12827 }
12828
Victor Stinnerc3c74152011-10-02 20:39:55 +020012829 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012830 if (share_utf8) {
12831 _PyUnicode_UTF8_LENGTH(self) = length;
12832 _PyUnicode_UTF8(self) = data;
12833 }
12834 if (share_wstr) {
12835 _PyUnicode_WSTR_LENGTH(self) = length;
12836 _PyUnicode_WSTR(self) = (wchar_t *)data;
12837 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012838
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012839 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12840 PyUnicode_KIND_SIZE(kind, length + 1));
12841 Py_DECREF(unicode);
12842 return (PyObject *)self;
12843
12844onError:
12845 Py_DECREF(unicode);
12846 Py_DECREF(self);
12847 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012848}
12849
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012850PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012851 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012852\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012853Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012854encoding defaults to the current default string encoding.\n\
12855errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012856
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012857static PyObject *unicode_iter(PyObject *seq);
12858
Guido van Rossumd57fd912000-03-10 22:53:23 +000012859PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012860 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012861 "str", /* tp_name */
12862 sizeof(PyUnicodeObject), /* tp_size */
12863 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012864 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012865 (destructor)unicode_dealloc, /* tp_dealloc */
12866 0, /* tp_print */
12867 0, /* tp_getattr */
12868 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012869 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012870 unicode_repr, /* tp_repr */
12871 &unicode_as_number, /* tp_as_number */
12872 &unicode_as_sequence, /* tp_as_sequence */
12873 &unicode_as_mapping, /* tp_as_mapping */
12874 (hashfunc) unicode_hash, /* tp_hash*/
12875 0, /* tp_call*/
12876 (reprfunc) unicode_str, /* tp_str */
12877 PyObject_GenericGetAttr, /* tp_getattro */
12878 0, /* tp_setattro */
12879 0, /* tp_as_buffer */
12880 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012881 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012882 unicode_doc, /* tp_doc */
12883 0, /* tp_traverse */
12884 0, /* tp_clear */
12885 PyUnicode_RichCompare, /* tp_richcompare */
12886 0, /* tp_weaklistoffset */
12887 unicode_iter, /* tp_iter */
12888 0, /* tp_iternext */
12889 unicode_methods, /* tp_methods */
12890 0, /* tp_members */
12891 0, /* tp_getset */
12892 &PyBaseObject_Type, /* tp_base */
12893 0, /* tp_dict */
12894 0, /* tp_descr_get */
12895 0, /* tp_descr_set */
12896 0, /* tp_dictoffset */
12897 0, /* tp_init */
12898 0, /* tp_alloc */
12899 unicode_new, /* tp_new */
12900 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012901};
12902
12903/* Initialize the Unicode implementation */
12904
Thomas Wouters78890102000-07-22 19:25:51 +000012905void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012906{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012907 int i;
12908
Thomas Wouters477c8d52006-05-27 19:21:47 +000012909 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012910 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012911 0x000A, /* LINE FEED */
12912 0x000D, /* CARRIAGE RETURN */
12913 0x001C, /* FILE SEPARATOR */
12914 0x001D, /* GROUP SEPARATOR */
12915 0x001E, /* RECORD SEPARATOR */
12916 0x0085, /* NEXT LINE */
12917 0x2028, /* LINE SEPARATOR */
12918 0x2029, /* PARAGRAPH SEPARATOR */
12919 };
12920
Fred Drakee4315f52000-05-09 19:53:39 +000012921 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020012922 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012923 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012924 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012925
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012926 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012927 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012928 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012929 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012930
12931 /* initialize the linebreak bloom filter */
12932 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012933 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012934 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012935
12936 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012937}
12938
12939/* Finalize the Unicode implementation */
12940
/* This implementation releases nothing and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
12946
Guido van Rossumd57fd912000-03-10 22:53:23 +000012947void
Thomas Wouters78890102000-07-22 19:25:51 +000012948_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012949{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012950 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012951
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012952 Py_XDECREF(unicode_empty);
12953 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012954
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012955 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012956 if (unicode_latin1[i]) {
12957 Py_DECREF(unicode_latin1[i]);
12958 unicode_latin1[i] = NULL;
12959 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012960 }
Christian Heimesa156e092008-02-16 07:38:31 +000012961 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012962}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012963
Walter Dörwald16807132007-05-25 13:52:07 +000012964void
12965PyUnicode_InternInPlace(PyObject **p)
12966{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012967 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12968 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020012969#ifdef Py_DEBUG
12970 assert(s != NULL);
12971 assert(_PyUnicode_CHECK(s));
12972#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000012973 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020012974 return;
12975#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000012976 /* If it's a subclass, we don't really know what putting
12977 it in the interned dict might do. */
12978 if (!PyUnicode_CheckExact(s))
12979 return;
12980 if (PyUnicode_CHECK_INTERNED(s))
12981 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982 if (PyUnicode_READY(s) == -1) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020012983 assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012984 return;
12985 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012986 if (interned == NULL) {
12987 interned = PyDict_New();
12988 if (interned == NULL) {
12989 PyErr_Clear(); /* Don't leave an exception */
12990 return;
12991 }
12992 }
12993 /* It might be that the GetItem call fails even
12994 though the key is present in the dictionary,
12995 namely when this happens during a stack overflow. */
12996 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012997 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012998 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012999
Benjamin Peterson29060642009-01-31 22:14:21 +000013000 if (t) {
13001 Py_INCREF(t);
13002 Py_DECREF(*p);
13003 *p = t;
13004 return;
13005 }
Walter Dörwald16807132007-05-25 13:52:07 +000013006
Benjamin Peterson14339b62009-01-31 16:36:08 +000013007 PyThreadState_GET()->recursion_critical = 1;
13008 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13009 PyErr_Clear();
13010 PyThreadState_GET()->recursion_critical = 0;
13011 return;
13012 }
13013 PyThreadState_GET()->recursion_critical = 0;
13014 /* The two references in interned are not counted by refcnt.
13015 The deallocator will take care of this */
13016 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013017 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013018}
13019
13020void
13021PyUnicode_InternImmortal(PyObject **p)
13022{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013023 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13024
Benjamin Peterson14339b62009-01-31 16:36:08 +000013025 PyUnicode_InternInPlace(p);
13026 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013027 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013028 Py_INCREF(*p);
13029 }
Walter Dörwald16807132007-05-25 13:52:07 +000013030}
13031
13032PyObject *
13033PyUnicode_InternFromString(const char *cp)
13034{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013035 PyObject *s = PyUnicode_FromString(cp);
13036 if (s == NULL)
13037 return NULL;
13038 PyUnicode_InternInPlace(&s);
13039 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013040}
13041
Alexander Belopolsky40018472011-02-26 01:02:56 +000013042void
13043_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013044{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013045 PyObject *keys;
13046 PyUnicodeObject *s;
13047 Py_ssize_t i, n;
13048 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013049
Benjamin Peterson14339b62009-01-31 16:36:08 +000013050 if (interned == NULL || !PyDict_Check(interned))
13051 return;
13052 keys = PyDict_Keys(interned);
13053 if (keys == NULL || !PyList_Check(keys)) {
13054 PyErr_Clear();
13055 return;
13056 }
Walter Dörwald16807132007-05-25 13:52:07 +000013057
Benjamin Peterson14339b62009-01-31 16:36:08 +000013058 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13059 detector, interned unicode strings are not forcibly deallocated;
13060 rather, we give them their stolen references back, and then clear
13061 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013062
Benjamin Peterson14339b62009-01-31 16:36:08 +000013063 n = PyList_GET_SIZE(keys);
13064 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013065 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013066 for (i = 0; i < n; i++) {
13067 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013068 if (PyUnicode_READY(s) == -1)
13069 fprintf(stderr, "could not ready string\n");
13070 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013071 case SSTATE_NOT_INTERNED:
13072 /* XXX Shouldn't happen */
13073 break;
13074 case SSTATE_INTERNED_IMMORTAL:
13075 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013076 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013077 break;
13078 case SSTATE_INTERNED_MORTAL:
13079 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013080 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013081 break;
13082 default:
13083 Py_FatalError("Inconsistent interned string state.");
13084 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013085 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013086 }
13087 fprintf(stderr, "total size of all interned strings: "
13088 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13089 "mortal/immortal\n", mortal_size, immortal_size);
13090 Py_DECREF(keys);
13091 PyDict_Clear(interned);
13092 Py_DECREF(interned);
13093 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013094}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013095
13096
13097/********************* Unicode Iterator **************************/
13098
13099typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013100 PyObject_HEAD
13101 Py_ssize_t it_index;
13102 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013103} unicodeiterobject;
13104
13105static void
13106unicodeiter_dealloc(unicodeiterobject *it)
13107{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013108 _PyObject_GC_UNTRACK(it);
13109 Py_XDECREF(it->it_seq);
13110 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013111}
13112
13113static int
13114unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13115{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013116 Py_VISIT(it->it_seq);
13117 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013118}
13119
13120static PyObject *
13121unicodeiter_next(unicodeiterobject *it)
13122{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013123 PyUnicodeObject *seq;
13124 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013125
Benjamin Peterson14339b62009-01-31 16:36:08 +000013126 assert(it != NULL);
13127 seq = it->it_seq;
13128 if (seq == NULL)
13129 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013130 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013132 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13133 int kind = PyUnicode_KIND(seq);
13134 void *data = PyUnicode_DATA(seq);
13135 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13136 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013137 if (item != NULL)
13138 ++it->it_index;
13139 return item;
13140 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013141
Benjamin Peterson14339b62009-01-31 16:36:08 +000013142 Py_DECREF(seq);
13143 it->it_seq = NULL;
13144 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013145}
13146
13147static PyObject *
13148unicodeiter_len(unicodeiterobject *it)
13149{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013150 Py_ssize_t len = 0;
13151 if (it->it_seq)
13152 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13153 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013154}
13155
13156PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13157
13158static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013159 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013160 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013161 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013162};
13163
13164PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013165 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13166 "str_iterator", /* tp_name */
13167 sizeof(unicodeiterobject), /* tp_basicsize */
13168 0, /* tp_itemsize */
13169 /* methods */
13170 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13171 0, /* tp_print */
13172 0, /* tp_getattr */
13173 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013174 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013175 0, /* tp_repr */
13176 0, /* tp_as_number */
13177 0, /* tp_as_sequence */
13178 0, /* tp_as_mapping */
13179 0, /* tp_hash */
13180 0, /* tp_call */
13181 0, /* tp_str */
13182 PyObject_GenericGetAttr, /* tp_getattro */
13183 0, /* tp_setattro */
13184 0, /* tp_as_buffer */
13185 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13186 0, /* tp_doc */
13187 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13188 0, /* tp_clear */
13189 0, /* tp_richcompare */
13190 0, /* tp_weaklistoffset */
13191 PyObject_SelfIter, /* tp_iter */
13192 (iternextfunc)unicodeiter_next, /* tp_iternext */
13193 unicodeiter_methods, /* tp_methods */
13194 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013195};
13196
13197static PyObject *
13198unicode_iter(PyObject *seq)
13199{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013200 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013201
Benjamin Peterson14339b62009-01-31 16:36:08 +000013202 if (!PyUnicode_Check(seq)) {
13203 PyErr_BadInternalCall();
13204 return NULL;
13205 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013206 if (PyUnicode_READY(seq) == -1)
13207 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013208 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13209 if (it == NULL)
13210 return NULL;
13211 it->it_index = 0;
13212 Py_INCREF(seq);
13213 it->it_seq = (PyUnicodeObject *)seq;
13214 _PyObject_GC_TRACK(it);
13215 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013216}
13217
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013218#define UNIOP(x) Py_UNICODE_##x
13219#define UNIOP_t Py_UNICODE
13220#include "uniops.h"
13221#undef UNIOP
13222#undef UNIOP_t
13223#define UNIOP(x) Py_UCS4_##x
13224#define UNIOP_t Py_UCS4
13225#include "uniops.h"
13226#undef UNIOP
13227#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013228
Victor Stinner71133ff2010-09-01 23:43:53 +000013229Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013230PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013231{
13232 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13233 Py_UNICODE *copy;
13234 Py_ssize_t size;
13235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013236 if (!PyUnicode_Check(unicode)) {
13237 PyErr_BadArgument();
13238 return NULL;
13239 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013240 /* Ensure we won't overflow the size. */
13241 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13242 PyErr_NoMemory();
13243 return NULL;
13244 }
13245 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13246 size *= sizeof(Py_UNICODE);
13247 copy = PyMem_Malloc(size);
13248 if (copy == NULL) {
13249 PyErr_NoMemory();
13250 return NULL;
13251 }
13252 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13253 return copy;
13254}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013255
Georg Brandl66c221e2010-10-14 07:04:07 +000013256/* A _string module, to export formatter_parser and formatter_field_name_split
13257 to the string.Formatter class implemented in Python. */
13258
13259static PyMethodDef _string_methods[] = {
13260 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13261 METH_O, PyDoc_STR("split the argument as a field name")},
13262 {"formatter_parser", (PyCFunction) formatter_parser,
13263 METH_O, PyDoc_STR("parse the argument as a format string")},
13264 {NULL, NULL}
13265};
13266
13267static struct PyModuleDef _string_module = {
13268 PyModuleDef_HEAD_INIT,
13269 "_string",
13270 PyDoc_STR("string helper module"),
13271 0,
13272 _string_methods,
13273 NULL,
13274 NULL,
13275 NULL,
13276 NULL
13277};
13278
13279PyMODINIT_FUNC
13280PyInit__string(void)
13281{
13282 return PyModule_Create(&_string_module);
13283}
13284
13285
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013286#ifdef __cplusplus
13287}
13288#endif