/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Limit for the Unicode object free list */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif

/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/


#ifdef __cplusplus
extern "C" {
#endif

/* Object check used by the accessor macros below: the full consistency
   check when Py_DEBUG is defined, a plain type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Raw field accessors (names starting with an underscore) perform no
   checks and can be used as lvalues; the checked variants assert on the
   object first and are read-only expressions. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement for PyUnicode_READY: evaluates to 0 when the
   object is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element in [begin, end) is cast to to_type and stored at
   the matching offset from `to`; an empty range writes nothing. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_; to_type *to_;           \
        for (iter_ = (begin), to_ = (to_type *)(to);    \
             iter_ < (end);                             \
             ++iter_, ++to_) {                          \
            *to_ = (to_type)*iter_;                     \
        }                                               \
    } while (0)

/* The Unicode string has been modified: reset the hash */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)

Walter Dörwald16807132007-05-25 13:52:07 +0000159/* This dictionary holds all interned unicode strings. Note that references
160 to strings in this dictionary are *not* counted in the string's ob_refcnt.
161 When the interned string reaches a refcnt of 0 the string deallocation
162 function will delete the reference from this dictionary.
163
164 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000165 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000166*/
167static PyObject *interned;
168
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000169/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200170static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000171
172/* Single character Unicode strings in the Latin-1 range are being
173 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200174static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by ASCII code point: 1 for whitespace, else 0. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

Victor Stinnerfe226c02011-10-03 03:52:20 +0200207static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
208
Alexander Belopolsky40018472011-02-26 01:02:56 +0000209static PyObject *
210unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000211 PyObject **errorHandler,const char *encoding, const char *reason,
212 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
213 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
214
Alexander Belopolsky40018472011-02-26 01:02:56 +0000215static void
216raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300217 const char *encoding,
218 const Py_UNICODE *unicode, Py_ssize_t size,
219 Py_ssize_t startpos, Py_ssize_t endpos,
220 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000221
/* Same for linebreaks.
   128-entry table indexed by ASCII code point: 1 for a line break, else 0.
   The table is only ever read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300250/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
251 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000252Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000253PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000254{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000255#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000256 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000257#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000258 /* This is actually an illegal character, so it should
259 not be passed to unichr. */
260 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000261#endif
262}
263
#ifdef Py_DEBUG
/* Debug-only sanity check of a Unicode object's state bits and pointers.

   Asserts that the combination of the ascii / compact / ready / kind
   fields matches one of the four supported layouts (compact ASCII,
   compact non-ASCII, legacy not-ready "wstr", legacy ready).  Always
   returns 1 so that it can be used inside an assert() expression; a
   violation aborts through the failing assert instead. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1) {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.compact == 1);
        assert(ascii->state.ready == 1);
    }
    else if (ascii->state.compact == 1) {
        /* compact, non-ASCII (compact == 1 is the branch condition) */
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
    } else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        PyUnicodeObject *unicode = (PyUnicodeObject *)op;

        if (kind == PyUnicode_WCHAR_KIND) {
            /* legacy string that only has a wchar_t representation */
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->wstr != NULL);
            assert(unicode->data.any == NULL);
            assert(compact->utf8 == NULL);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
        }
        else {
            /* legacy string with a separately allocated data buffer */
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(unicode->data.any != NULL);
            assert(ascii->state.ascii == 0);
        }
    }
    return 1;
}
#endif

Thomas Wouters477c8d52006-05-27 19:21:47 +0000315/* --- Bloom Filters ----------------------------------------------------- */
316
317/* stuff to implement simple "bloom filters" for Unicode characters.
318 to keep things simple, we use a single bitmask, using the least 5
319 bits from each unicode characters as the bit index. */
320
321/* the linebreak mask is set up by Unicode_Init below */
322
Antoine Pitrouf068f942010-01-13 14:19:12 +0000323#if LONG_BIT >= 128
324#define BLOOM_WIDTH 128
325#elif LONG_BIT >= 64
326#define BLOOM_WIDTH 64
327#elif LONG_BIT >= 32
328#define BLOOM_WIDTH 32
329#else
330#error "LONG_BIT is smaller than 32"
331#endif
332
/* Type of a bloom bitmask. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit selected by ch in mask (mask must be an lvalue);
   BLOOM is non-zero when that bit is set, i.e. ch *may* be a member. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Linebreak test: table lookup below 128, otherwise the bloom mask is
   used as a cheap pre-filter before Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))

Alexander Belopolsky40018472011-02-26 01:02:56 +0000344Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200345make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000346{
347 /* calculate simple bloom-style bitmask for a given unicode string */
348
Antoine Pitrouf068f942010-01-13 14:19:12 +0000349 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000350 Py_ssize_t i;
351
352 mask = 0;
353 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200354 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000355
356 return mask;
357}
358
/* True if chr occurs in str: the bloom mask rejects most non-members
   cheaply, PyUnicode_FindChar() confirms the remaining candidates. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar((str), (chr), 0, PyUnicode_GET_LENGTH(str), 1) >= 0))

Guido van Rossumd57fd912000-03-10 22:53:23 +0000363/* --- Unicode Object ----------------------------------------------------- */
364
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200365static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200366fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
367
368Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
369 Py_ssize_t size, Py_UCS4 ch,
370 int direction)
371{
372 /* like wcschr, but doesn't stop at NULL characters */
373 Py_ssize_t i;
374 if (direction == 1) {
375 for(i = 0; i < size; i++)
376 if (PyUnicode_READ(kind, s, i) == ch)
377 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
378 }
379 else {
380 for(i = size-1; i >= 0; i--)
381 if (PyUnicode_READ(kind, s, i) == ch)
382 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
383 }
384 return NULL;
385}
386
Victor Stinnerfe226c02011-10-03 03:52:20 +0200387static PyObject*
388resize_compact(PyObject *unicode, Py_ssize_t length)
389{
390 Py_ssize_t char_size;
391 Py_ssize_t struct_size;
392 Py_ssize_t new_size;
393 int share_wstr;
394
395 assert(PyUnicode_IS_READY(unicode));
396 char_size = PyUnicode_CHARACTER_SIZE(unicode);
397 if (PyUnicode_IS_COMPACT_ASCII(unicode))
398 struct_size = sizeof(PyASCIIObject);
399 else
400 struct_size = sizeof(PyCompactUnicodeObject);
401 share_wstr = (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(unicode));
402
403 _Py_DEC_REFTOTAL;
404 _Py_ForgetReference(unicode);
405
406 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
407 PyErr_NoMemory();
408 return NULL;
409 }
410 new_size = (struct_size + (length + 1) * char_size);
411
412 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
413 if (unicode == NULL) {
414 PyObject_Del(unicode);
415 PyErr_NoMemory();
416 return NULL;
417 }
418 _Py_NewReference(unicode);
419 _PyUnicode_LENGTH(unicode) = length;
420 if (share_wstr)
421 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
422 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
423 length, 0);
424 return unicode;
425}
426
Alexander Belopolsky40018472011-02-26 01:02:56 +0000427static int
Victor Stinnerfe226c02011-10-03 03:52:20 +0200428resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000429{
430 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200432 assert(!PyUnicode_IS_COMPACT(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200433
Victor Stinnerfe226c02011-10-03 03:52:20 +0200434 assert(Py_REFCNT(unicode) == 1);
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200435 _PyUnicode_DIRTY(unicode);
Tim Petersced69f82003-09-16 20:30:58 +0000436
Victor Stinnerfe226c02011-10-03 03:52:20 +0200437 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
438 {
439 PyObject_DEL(_PyUnicode_UTF8(unicode));
440 _PyUnicode_UTF8(unicode) = NULL;
441 }
442
443 if (PyUnicode_IS_READY(unicode)) {
444 Py_ssize_t char_size;
445 Py_ssize_t new_size;
446 int share_wstr;
447 void *data;
448
449 data = _PyUnicode_DATA_ANY(unicode);
450 assert(data != NULL);
451 char_size = PyUnicode_CHARACTER_SIZE(unicode);
452 share_wstr = (_PyUnicode_WSTR(unicode) == data);
453
454 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
455 PyErr_NoMemory();
456 return -1;
457 }
458 new_size = (length + 1) * char_size;
459
460 data = (PyObject *)PyObject_REALLOC(data, new_size);
461 if (data == NULL) {
462 PyErr_NoMemory();
463 return -1;
464 }
465 _PyUnicode_DATA_ANY(unicode) = data;
466 if (share_wstr)
467 _PyUnicode_WSTR(unicode) = data;
468 _PyUnicode_LENGTH(unicode) = length;
469 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
470 if (share_wstr)
471 return 0;
472 }
473 if (_PyUnicode_WSTR(unicode) != NULL) {
474 assert(_PyUnicode_WSTR(unicode) != NULL);
475
476 oldstr = _PyUnicode_WSTR(unicode);
477 _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
478 sizeof(Py_UNICODE) * (length + 1));
479 if (!_PyUnicode_WSTR(unicode)) {
480 _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
481 PyErr_NoMemory();
482 return -1;
483 }
484 _PyUnicode_WSTR(unicode)[length] = 0;
485 _PyUnicode_WSTR_LENGTH(unicode) = length;
486 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000487 return 0;
488}
489
Victor Stinnerfe226c02011-10-03 03:52:20 +0200490static PyObject*
491resize_copy(PyObject *unicode, Py_ssize_t length)
492{
493 Py_ssize_t copy_length;
494 if (PyUnicode_IS_COMPACT(unicode)) {
495 PyObject *copy;
496 assert(PyUnicode_IS_READY(unicode));
497
498 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
499 if (copy == NULL)
500 return NULL;
501
502 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
503 if (PyUnicode_CopyCharacters(copy, 0,
504 unicode, 0,
505 copy_length) < 0)
506 {
507 Py_DECREF(copy);
508 return NULL;
509 }
510 return copy;
511 } else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200512 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200513 assert(_PyUnicode_WSTR(unicode) != NULL);
514 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200515 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200516 if (w == NULL)
517 return NULL;
518 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
519 copy_length = Py_MIN(copy_length, length);
520 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
521 copy_length);
522 return (PyObject*)w;
523 }
524}
525
Guido van Rossumd57fd912000-03-10 22:53:23 +0000526/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000527 Ux0000 terminated; some code (e.g. new_identifier)
528 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000529
530 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000531 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000532
533*/
534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200535#ifdef Py_DEBUG
536int unicode_old_new_calls = 0;
537#endif
538
Alexander Belopolsky40018472011-02-26 01:02:56 +0000539static PyUnicodeObject *
540_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000541{
542 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200543 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000544
Thomas Wouters477c8d52006-05-27 19:21:47 +0000545 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000546 if (length == 0 && unicode_empty != NULL) {
547 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200548 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000549 }
550
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000551 /* Ensure we won't overflow the size. */
552 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
553 return (PyUnicodeObject *)PyErr_NoMemory();
554 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555 if (length < 0) {
556 PyErr_SetString(PyExc_SystemError,
557 "Negative size passed to _PyUnicode_New");
558 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000559 }
560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200561#ifdef Py_DEBUG
562 ++unicode_old_new_calls;
563#endif
564
565 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
566 if (unicode == NULL)
567 return NULL;
568 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
569 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
570 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000571 PyErr_NoMemory();
572 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000573 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200574
Jeremy Hyltond8082792003-09-16 19:41:39 +0000575 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000576 * the caller fails before initializing str -- unicode_resize()
577 * reads str[0], and the Keep-Alive optimization can keep memory
578 * allocated for str alive across a call to unicode_dealloc(unicode).
579 * We don't want unicode_resize to read uninitialized memory in
580 * that case.
581 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200582 _PyUnicode_WSTR(unicode)[0] = 0;
583 _PyUnicode_WSTR(unicode)[length] = 0;
584 _PyUnicode_WSTR_LENGTH(unicode) = length;
585 _PyUnicode_HASH(unicode) = -1;
586 _PyUnicode_STATE(unicode).interned = 0;
587 _PyUnicode_STATE(unicode).kind = 0;
588 _PyUnicode_STATE(unicode).compact = 0;
589 _PyUnicode_STATE(unicode).ready = 0;
590 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200591 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200592 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200593 _PyUnicode_UTF8(unicode) = NULL;
594 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000595 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000596
Benjamin Peterson29060642009-01-31 22:14:21 +0000597 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000598 /* XXX UNREF/NEWREF interface should be more symmetrical */
599 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000600 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000601 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000602 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603}
604
Victor Stinnerf42dc442011-10-02 23:33:16 +0200605static const char*
606unicode_kind_name(PyObject *unicode)
607{
Victor Stinner910337b2011-10-03 03:20:16 +0200608 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerf42dc442011-10-02 23:33:16 +0200609 if (!PyUnicode_IS_COMPACT(unicode))
610 {
611 if (!PyUnicode_IS_READY(unicode))
612 return "wstr";
613 switch(PyUnicode_KIND(unicode))
614 {
615 case PyUnicode_1BYTE_KIND:
616 if (PyUnicode_IS_COMPACT_ASCII(unicode))
617 return "legacy ascii";
618 else
619 return "legacy latin1";
620 case PyUnicode_2BYTE_KIND:
621 return "legacy UCS2";
622 case PyUnicode_4BYTE_KIND:
623 return "legacy UCS4";
624 default:
625 return "<legacy invalid kind>";
626 }
627 }
628 assert(PyUnicode_IS_READY(unicode));
629 switch(PyUnicode_KIND(unicode))
630 {
631 case PyUnicode_1BYTE_KIND:
632 if (PyUnicode_IS_COMPACT_ASCII(unicode))
633 return "ascii";
634 else
635 return "compact latin1";
636 case PyUnicode_2BYTE_KIND:
637 return "compact UCS2";
638 case PyUnicode_4BYTE_KIND:
639 return "compact UCS4";
640 default:
641 return "<invalid compact kind>";
642 }
643}
644
#ifdef Py_DEBUG
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the layout flags and candidate data addresses of `unicode` to
   stdout, then return PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line summary of `op` to stdout: layout name, length and
   wstr pointer, plus the wstr/utf8 cache fields for non-ASCII objects and
   the data pointer for non-compact objects.

   Lengths are Py_ssize_t and therefore printed with %zd; pointers are
   cast to void * as %p requires. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    printf("%s: len=%zd, wstr=%p",
           unicode_kind_name(op),
           ascii->length,
           (void *)ascii->wstr);
    if (!ascii->state.ascii) {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        printf(" (%zd), utf8=%p (%zd)",
               compact->wstr_length,
               (void *)compact->utf8,
               compact->utf8_length);
    }
    if (!ascii->state.compact) {
        PyUnicodeObject *unicode = (PyUnicodeObject *)op;
        printf(", data=%p",
               unicode->data.any);
    }
    printf("\n");
}
#endif

690PyObject *
691PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
692{
693 PyObject *obj;
694 PyCompactUnicodeObject *unicode;
695 void *data;
696 int kind_state;
697 int is_sharing = 0, is_ascii = 0;
698 Py_ssize_t char_size;
699 Py_ssize_t struct_size;
700
701 /* Optimization for empty strings */
702 if (size == 0 && unicode_empty != NULL) {
703 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200704 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200705 }
706
707#ifdef Py_DEBUG
708 ++unicode_new_new_calls;
709#endif
710
711 struct_size = sizeof(PyCompactUnicodeObject);
712 if (maxchar < 128) {
713 kind_state = PyUnicode_1BYTE_KIND;
714 char_size = 1;
715 is_ascii = 1;
716 struct_size = sizeof(PyASCIIObject);
717 }
718 else if (maxchar < 256) {
719 kind_state = PyUnicode_1BYTE_KIND;
720 char_size = 1;
721 }
722 else if (maxchar < 65536) {
723 kind_state = PyUnicode_2BYTE_KIND;
724 char_size = 2;
725 if (sizeof(wchar_t) == 2)
726 is_sharing = 1;
727 }
728 else {
729 kind_state = PyUnicode_4BYTE_KIND;
730 char_size = 4;
731 if (sizeof(wchar_t) == 4)
732 is_sharing = 1;
733 }
734
735 /* Ensure we won't overflow the size. */
736 if (size < 0) {
737 PyErr_SetString(PyExc_SystemError,
738 "Negative size passed to PyUnicode_New");
739 return NULL;
740 }
741 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
742 return PyErr_NoMemory();
743
744 /* Duplicated allocation code from _PyObject_New() instead of a call to
745 * PyObject_New() so we are able to allocate space for the object and
746 * it's data buffer.
747 */
748 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
749 if (obj == NULL)
750 return PyErr_NoMemory();
751 obj = PyObject_INIT(obj, &PyUnicode_Type);
752 if (obj == NULL)
753 return NULL;
754
755 unicode = (PyCompactUnicodeObject *)obj;
756 if (is_ascii)
757 data = ((PyASCIIObject*)obj) + 1;
758 else
759 data = unicode + 1;
760 _PyUnicode_LENGTH(unicode) = size;
761 _PyUnicode_HASH(unicode) = -1;
762 _PyUnicode_STATE(unicode).interned = 0;
763 _PyUnicode_STATE(unicode).kind = kind_state;
764 _PyUnicode_STATE(unicode).compact = 1;
765 _PyUnicode_STATE(unicode).ready = 1;
766 _PyUnicode_STATE(unicode).ascii = is_ascii;
767 if (is_ascii) {
768 ((char*)data)[size] = 0;
769 _PyUnicode_WSTR(unicode) = NULL;
770 }
771 else if (kind_state == PyUnicode_1BYTE_KIND) {
772 ((char*)data)[size] = 0;
773 _PyUnicode_WSTR(unicode) = NULL;
774 _PyUnicode_WSTR_LENGTH(unicode) = 0;
775 unicode->utf8_length = 0;
776 unicode->utf8 = NULL;
777 }
778 else {
779 unicode->utf8 = NULL;
780 if (kind_state == PyUnicode_2BYTE_KIND)
781 ((Py_UCS2*)data)[size] = 0;
782 else /* kind_state == PyUnicode_4BYTE_KIND */
783 ((Py_UCS4*)data)[size] = 0;
784 if (is_sharing) {
785 _PyUnicode_WSTR_LENGTH(unicode) = size;
786 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
787 }
788 else {
789 _PyUnicode_WSTR_LENGTH(unicode) = 0;
790 _PyUnicode_WSTR(unicode) = NULL;
791 }
792 }
793 return obj;
794}
795
#if SIZEOF_WCHAR_T == 2
/* Decode the 16-bit wchar_t range [begin, end) into the UCS4 buffer of
   `unicode`, which must already be of 4-byte kind.

   A high surrogate (0xD800-0xDBFF) immediately followed by a low surrogate
   (0xDC00-0xDFFF) is merged into a single code point; every other unit,
   including a lone surrogate, is copied through as is.

   The destination length must match the number of decoded code points
   exactly (checked with assertions).  No terminating null is written here;
   the buffer is expected to have room for one past the decoded data. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            /* Plain BMP unit or unpaired surrogate. */
            *dst++ = *src++;
            continue;
        }
        /* Combine the 10 payload bits of each half. */
        *dst++ = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
834
Victor Stinnercd9950f2011-10-02 00:34:53 +0200835static int
836_PyUnicode_Dirty(PyObject *unicode)
837{
Victor Stinner910337b2011-10-03 03:20:16 +0200838 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200839 if (Py_REFCNT(unicode) != 1) {
840 PyErr_SetString(PyExc_ValueError,
841 "Cannot modify a string having more than 1 reference");
842 return -1;
843 }
844 _PyUnicode_DIRTY(unicode);
845 return 0;
846}
847
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200848Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200849PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
850 PyObject *from, Py_ssize_t from_start,
851 Py_ssize_t how_many)
852{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200853 unsigned int from_kind, to_kind;
854 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200855
Victor Stinnerb1536152011-09-30 02:26:10 +0200856 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
857 PyErr_BadInternalCall();
858 return -1;
859 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200860
861 if (PyUnicode_READY(from))
862 return -1;
863 if (PyUnicode_READY(to))
864 return -1;
865
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200866 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200867 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
868 PyErr_Format(PyExc_ValueError,
869 "Cannot write %zi characters at %zi "
870 "in a string of %zi characters",
871 how_many, to_start, PyUnicode_GET_LENGTH(to));
872 return -1;
873 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200874 if (how_many == 0)
875 return 0;
876
Victor Stinnercd9950f2011-10-02 00:34:53 +0200877 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200878 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200880 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200881 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200882 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200883 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200884
Victor Stinnerf42dc442011-10-02 23:33:16 +0200885 if (from_kind == to_kind
886 /* deny latin1 => ascii */
887 && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
888 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200889 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200890 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200891 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200892 + PyUnicode_KIND_SIZE(from_kind, from_start),
893 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200894 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200895 else if (from_kind == PyUnicode_1BYTE_KIND
896 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200897 {
898 _PyUnicode_CONVERT_BYTES(
899 Py_UCS1, Py_UCS2,
900 PyUnicode_1BYTE_DATA(from) + from_start,
901 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
902 PyUnicode_2BYTE_DATA(to) + to_start
903 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200904 }
Victor Stinner157f83f2011-09-28 21:41:31 +0200905 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200906 && to_kind == PyUnicode_4BYTE_KIND)
907 {
908 _PyUnicode_CONVERT_BYTES(
909 Py_UCS1, Py_UCS4,
910 PyUnicode_1BYTE_DATA(from) + from_start,
911 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
912 PyUnicode_4BYTE_DATA(to) + to_start
913 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200914 }
915 else if (from_kind == PyUnicode_2BYTE_KIND
916 && to_kind == PyUnicode_4BYTE_KIND)
917 {
918 _PyUnicode_CONVERT_BYTES(
919 Py_UCS2, Py_UCS4,
920 PyUnicode_2BYTE_DATA(from) + from_start,
921 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
922 PyUnicode_4BYTE_DATA(to) + to_start
923 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200924 }
Victor Stinnera0702ab2011-09-29 14:14:38 +0200925 else {
926 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +0200927
928 /* check if max_char(from substring) <= max_char(to) */
929 if (from_kind > to_kind
930 /* latin1 => ascii */
931 || (PyUnicode_IS_COMPACT_ASCII(to)
932 && to_kind == PyUnicode_1BYTE_KIND
933 && !PyUnicode_IS_COMPACT_ASCII(from)))
934 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200935 /* slow path to check for character overflow */
936 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
937 Py_UCS4 ch, maxchar;
938 Py_ssize_t i;
939
940 maxchar = 0;
941 invalid_kinds = 0;
942 for (i=0; i < how_many; i++) {
943 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
944 if (ch > maxchar) {
945 maxchar = ch;
946 if (maxchar > to_maxchar) {
947 invalid_kinds = 1;
948 break;
949 }
950 }
951 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
952 }
953 }
954 else
955 invalid_kinds = 1;
956 if (invalid_kinds) {
957 PyErr_Format(PyExc_ValueError,
Victor Stinnerf42dc442011-10-02 23:33:16 +0200958 "Cannot copy %s characters "
959 "into a string of %s characters",
960 unicode_kind_name(from),
961 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +0200962 return -1;
963 }
964 }
965 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200966}
967
Victor Stinner17222162011-09-28 22:15:37 +0200968/* Find the maximum code point and count the number of surrogate pairs so a
969 correct string length can be computed before converting a string to UCS4.
970 This function counts single surrogates as a character and not as a pair.
971
972 Return 0 on success, or -1 on error. */
973static int
974find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
975 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976{
977 const wchar_t *iter;
978
Victor Stinnerc53be962011-10-02 21:33:54 +0200979 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200980 if (num_surrogates == NULL || maxchar == NULL) {
981 PyErr_SetString(PyExc_SystemError,
982 "unexpected NULL arguments to "
983 "PyUnicode_FindMaxCharAndNumSurrogatePairs");
984 return -1;
985 }
986
987 *num_surrogates = 0;
988 *maxchar = 0;
989
990 for (iter = begin; iter < end; ) {
991 if (*iter > *maxchar)
992 *maxchar = *iter;
993#if SIZEOF_WCHAR_T == 2
994 if (*iter >= 0xD800 && *iter <= 0xDBFF
995 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
996 {
997 Py_UCS4 surrogate_val;
998 surrogate_val = (((iter[0] & 0x3FF)<<10)
999 | (iter[1] & 0x3FF)) + 0x10000;
1000 ++(*num_surrogates);
1001 if (surrogate_val > *maxchar)
1002 *maxchar = surrogate_val;
1003 iter += 2;
1004 }
1005 else
1006 iter++;
1007#else
1008 iter++;
1009#endif
1010 }
1011 return 0;
1012}
1013
1014#ifdef Py_DEBUG
1015int unicode_ready_calls = 0;
1016#endif
1017
1018int
Victor Stinnerd8f65102011-09-29 19:43:17 +02001019_PyUnicode_Ready(PyObject *obj)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001020{
Victor Stinnerd8f65102011-09-29 19:43:17 +02001021 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001022 wchar_t *end;
1023 Py_UCS4 maxchar = 0;
1024 Py_ssize_t num_surrogates;
1025#if SIZEOF_WCHAR_T == 2
1026 Py_ssize_t length_wo_surrogates;
1027#endif
1028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001029 /* _PyUnicode_Ready() is only intented for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001030 strings were created using _PyObject_New() and where no canonical
1031 representation (the str field) has been set yet aka strings
1032 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001033 assert(_PyUnicode_CHECK(unicode));
1034 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001035 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001036 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001037 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001038 /* Actually, it should neither be interned nor be anything else: */
1039 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001040
1041#ifdef Py_DEBUG
1042 ++unicode_ready_calls;
1043#endif
1044
1045 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001046 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001047 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049
1050 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001051 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1052 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 PyErr_NoMemory();
1054 return -1;
1055 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001056 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057 _PyUnicode_WSTR(unicode), end,
1058 PyUnicode_1BYTE_DATA(unicode));
1059 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1060 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1061 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1062 if (maxchar < 128) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001063 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001064 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 }
1066 else {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001067 _PyUnicode_UTF8(unicode) = NULL;
1068 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 }
1070 PyObject_FREE(_PyUnicode_WSTR(unicode));
1071 _PyUnicode_WSTR(unicode) = NULL;
1072 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1073 }
1074 /* In this case we might have to convert down from 4-byte native
1075 wchar_t to 2-byte unicode. */
1076 else if (maxchar < 65536) {
1077 assert(num_surrogates == 0 &&
1078 "FindMaxCharAndNumSurrogatePairs() messed up");
1079
Victor Stinner506f5922011-09-28 22:34:18 +02001080#if SIZEOF_WCHAR_T == 2
1081 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001082 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001083 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1084 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1085 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001086 _PyUnicode_UTF8(unicode) = NULL;
1087 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001088#else
1089 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001090 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001091 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001092 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001093 PyErr_NoMemory();
1094 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001095 }
Victor Stinner506f5922011-09-28 22:34:18 +02001096 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1097 _PyUnicode_WSTR(unicode), end,
1098 PyUnicode_2BYTE_DATA(unicode));
1099 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1100 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1101 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001102 _PyUnicode_UTF8(unicode) = NULL;
1103 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001104 PyObject_FREE(_PyUnicode_WSTR(unicode));
1105 _PyUnicode_WSTR(unicode) = NULL;
1106 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1107#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001108 }
1109 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1110 else {
1111#if SIZEOF_WCHAR_T == 2
1112 /* in case the native representation is 2-bytes, we need to allocate a
1113 new normalized 4-byte version. */
1114 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001115 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1116 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 PyErr_NoMemory();
1118 return -1;
1119 }
1120 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1121 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001122 _PyUnicode_UTF8(unicode) = NULL;
1123 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001124 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1125 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001126 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127 PyObject_FREE(_PyUnicode_WSTR(unicode));
1128 _PyUnicode_WSTR(unicode) = NULL;
1129 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1130#else
1131 assert(num_surrogates == 0);
1132
Victor Stinnerc3c74152011-10-02 20:39:55 +02001133 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001135 _PyUnicode_UTF8(unicode) = NULL;
1136 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001137 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1138#endif
1139 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1140 }
1141 _PyUnicode_STATE(unicode).ready = 1;
1142 return 0;
1143}
1144
Alexander Belopolsky40018472011-02-26 01:02:56 +00001145static void
1146unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001147{
Walter Dörwald16807132007-05-25 13:52:07 +00001148 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001149 case SSTATE_NOT_INTERNED:
1150 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001151
Benjamin Peterson29060642009-01-31 22:14:21 +00001152 case SSTATE_INTERNED_MORTAL:
1153 /* revive dead object temporarily for DelItem */
1154 Py_REFCNT(unicode) = 3;
1155 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1156 Py_FatalError(
1157 "deletion of interned string failed");
1158 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001159
Benjamin Peterson29060642009-01-31 22:14:21 +00001160 case SSTATE_INTERNED_IMMORTAL:
1161 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001162
Benjamin Peterson29060642009-01-31 22:14:21 +00001163 default:
1164 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001165 }
1166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167 if (_PyUnicode_WSTR(unicode) &&
1168 (!PyUnicode_IS_READY(unicode) ||
1169 _PyUnicode_WSTR(unicode) != PyUnicode_DATA(unicode)))
1170 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001171 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001172 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173
1174 if (PyUnicode_IS_COMPACT(unicode)) {
1175 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176 }
1177 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001178 if (_PyUnicode_DATA_ANY(unicode))
1179 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001180 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181 }
1182}
1183
Alexander Belopolsky40018472011-02-26 01:02:56 +00001184static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001185unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001186{
Victor Stinnera3be6132011-10-03 02:16:37 +02001187 Py_ssize_t len;
Victor Stinnerca4f7a42011-10-03 04:18:04 +02001188#if SIZEOF_WCHAR_T == 2
1189 /* FIXME: unicode_resize() is buggy on Windows */
1190 return 0;
1191#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001192 if (Py_REFCNT(unicode) != 1)
1193 return 0;
1194 if (PyUnicode_CHECK_INTERNED(unicode))
1195 return 0;
1196 if (unicode == unicode_empty)
1197 return 0;
Victor Stinnera3be6132011-10-03 02:16:37 +02001198 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1199 len = PyUnicode_WSTR_LENGTH(unicode);
1200 else
1201 len = PyUnicode_GET_LENGTH(unicode);
1202 if (len == 1) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001203 Py_UCS4 ch;
Victor Stinnera3be6132011-10-03 02:16:37 +02001204 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001205 ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnera3be6132011-10-03 02:16:37 +02001206 else
1207 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001208 if (ch < 256 && unicode_latin1[ch] == unicode)
1209 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001210 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001211 /* FIXME: reenable resize_inplace */
1212 if (!PyUnicode_IS_COMPACT(unicode))
1213 return 0;
1214 return 1;
1215}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001216
Victor Stinnerfe226c02011-10-03 03:52:20 +02001217static int
1218unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1219{
1220 PyObject *unicode;
1221 Py_ssize_t old_length;
1222
1223 assert(p_unicode != NULL);
1224 unicode = *p_unicode;
1225
1226 assert(unicode != NULL);
1227 assert(PyUnicode_Check(unicode));
1228 assert(0 <= length);
1229
Victor Stinner910337b2011-10-03 03:20:16 +02001230 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001231 old_length = PyUnicode_WSTR_LENGTH(unicode);
1232 else
1233 old_length = PyUnicode_GET_LENGTH(unicode);
1234 if (old_length == length)
1235 return 0;
1236
1237 /* FIXME: really create a new object? */
1238 if (!unicode_resizable(unicode)) {
1239 PyObject *copy = resize_copy(unicode, length);
1240 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001241 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001242 Py_DECREF(*p_unicode);
1243 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001244 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001245 }
1246
Victor Stinnerfe226c02011-10-03 03:52:20 +02001247 if (PyUnicode_IS_COMPACT(unicode)) {
1248 *p_unicode = resize_compact(unicode, length);
1249 if (*p_unicode == NULL)
1250 return -1;
1251 return 0;
1252 } else
1253 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001254}
1255
Alexander Belopolsky40018472011-02-26 01:02:56 +00001256int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001257PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001258{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001259 PyObject *unicode;
1260 if (p_unicode == NULL) {
1261 PyErr_BadInternalCall();
1262 return -1;
1263 }
1264 unicode = *p_unicode;
1265 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1266 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1267 {
1268 PyErr_BadInternalCall();
1269 return -1;
1270 }
1271 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001272}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001274static PyObject*
1275get_latin1_char(unsigned char ch)
1276{
Victor Stinnera464fc12011-10-02 20:39:30 +02001277 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001278 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001279 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001280 if (!unicode)
1281 return NULL;
1282 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1283 unicode_latin1[ch] = unicode;
1284 }
1285 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001286 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001287}
1288
Alexander Belopolsky40018472011-02-26 01:02:56 +00001289PyObject *
1290PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291{
1292 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001293 Py_UCS4 maxchar = 0;
1294 Py_ssize_t num_surrogates;
1295
1296 if (u == NULL)
1297 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001298
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001299 /* If the Unicode data is known at construction time, we can apply
1300 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001302 /* Optimization for empty strings */
1303 if (size == 0 && unicode_empty != NULL) {
1304 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001305 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001306 }
Tim Petersced69f82003-09-16 20:30:58 +00001307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308 /* Single character Unicode objects in the Latin-1 range are
1309 shared when using this constructor */
1310 if (size == 1 && *u < 256)
1311 return get_latin1_char((unsigned char)*u);
1312
1313 /* If not empty and not single character, copy the Unicode data
1314 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001315 if (find_maxchar_surrogates(u, u + size,
1316 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 return NULL;
1318
1319 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1320 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321 if (!unicode)
1322 return NULL;
1323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324 switch (PyUnicode_KIND(unicode)) {
1325 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001326 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001327 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1328 break;
1329 case PyUnicode_2BYTE_KIND:
1330#if Py_UNICODE_SIZE == 2
1331 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1332#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001333 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1335#endif
1336 break;
1337 case PyUnicode_4BYTE_KIND:
1338#if SIZEOF_WCHAR_T == 2
1339 /* This is the only case which has to process surrogates, thus
1340 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001341 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342#else
1343 assert(num_surrogates == 0);
1344 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1345#endif
1346 break;
1347 default:
1348 assert(0 && "Impossible state");
1349 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350
1351 return (PyObject *)unicode;
1352}
1353
Alexander Belopolsky40018472011-02-26 01:02:56 +00001354PyObject *
1355PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001356{
1357 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001358
Benjamin Peterson14339b62009-01-31 16:36:08 +00001359 if (size < 0) {
1360 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001361 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001362 return NULL;
1363 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001364
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001365 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001366 some optimizations which share commonly used objects.
1367 Also, this means the input must be UTF-8, so fall back to the
1368 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001369 if (u != NULL) {
1370
Benjamin Peterson29060642009-01-31 22:14:21 +00001371 /* Optimization for empty strings */
1372 if (size == 0 && unicode_empty != NULL) {
1373 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001374 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001375 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001376
1377 /* Single characters are shared when using this constructor.
1378 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001379 if (size == 1 && Py_CHARMASK(*u) < 128)
1380 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001381
1382 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001383 }
1384
Walter Dörwald55507312007-05-18 13:12:10 +00001385 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001386 if (!unicode)
1387 return NULL;
1388
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001389 return (PyObject *)unicode;
1390}
1391
Alexander Belopolsky40018472011-02-26 01:02:56 +00001392PyObject *
1393PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001394{
1395 size_t size = strlen(u);
1396 if (size > PY_SSIZE_T_MAX) {
1397 PyErr_SetString(PyExc_OverflowError, "input too long");
1398 return NULL;
1399 }
1400
1401 return PyUnicode_FromStringAndSize(u, size);
1402}
1403
Victor Stinnere57b1c02011-09-28 22:20:48 +02001404static PyObject*
1405_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001406{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407 PyObject *res;
1408 unsigned char max = 127;
1409 Py_ssize_t i;
1410 for (i = 0; i < size; i++) {
1411 if (u[i] & 0x80) {
1412 max = 255;
1413 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001414 }
1415 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 res = PyUnicode_New(size, max);
1417 if (!res)
1418 return NULL;
1419 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1420 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001421}
1422
Victor Stinnere57b1c02011-09-28 22:20:48 +02001423static PyObject*
1424_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425{
1426 PyObject *res;
1427 Py_UCS2 max = 0;
1428 Py_ssize_t i;
1429 for (i = 0; i < size; i++)
1430 if (u[i] > max)
1431 max = u[i];
1432 res = PyUnicode_New(size, max);
1433 if (!res)
1434 return NULL;
1435 if (max >= 256)
1436 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1437 else
1438 for (i = 0; i < size; i++)
1439 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1440 return res;
1441}
1442
Victor Stinnere57b1c02011-09-28 22:20:48 +02001443static PyObject*
1444_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445{
1446 PyObject *res;
1447 Py_UCS4 max = 0;
1448 Py_ssize_t i;
1449 for (i = 0; i < size; i++)
1450 if (u[i] > max)
1451 max = u[i];
1452 res = PyUnicode_New(size, max);
1453 if (!res)
1454 return NULL;
1455 if (max >= 0x10000)
1456 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1457 else {
1458 int kind = PyUnicode_KIND(res);
1459 void *data = PyUnicode_DATA(res);
1460 for (i = 0; i < size; i++)
1461 PyUnicode_WRITE(kind, data, i, u[i]);
1462 }
1463 return res;
1464}
1465
1466PyObject*
1467PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1468{
1469 switch(kind) {
1470 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001471 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001473 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001475 return _PyUnicode_FromUCS4(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 }
Victor Stinner202b62b2011-10-01 23:48:37 +02001477 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 return NULL;
1479}
1480
Victor Stinner034f6cf2011-09-30 02:26:44 +02001481PyObject*
1482PyUnicode_Copy(PyObject *unicode)
1483{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001484 Py_ssize_t size;
1485 PyObject *copy;
1486 void *data;
1487
Victor Stinner034f6cf2011-09-30 02:26:44 +02001488 if (!PyUnicode_Check(unicode)) {
1489 PyErr_BadInternalCall();
1490 return NULL;
1491 }
1492 if (PyUnicode_READY(unicode))
1493 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001494
1495 size = PyUnicode_GET_LENGTH(unicode);
1496 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1497 if (!copy)
1498 return NULL;
1499 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1500
1501 data = PyUnicode_DATA(unicode);
1502 switch (PyUnicode_KIND(unicode))
1503 {
1504 case PyUnicode_1BYTE_KIND:
1505 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1506 break;
1507 case PyUnicode_2BYTE_KIND:
1508 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1509 break;
1510 case PyUnicode_4BYTE_KIND:
1511 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1512 break;
1513 default:
1514 assert(0);
1515 break;
1516 }
1517 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001518}
1519
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001520
Victor Stinnerbc603d12011-10-02 01:00:40 +02001521/* Widen Unicode objects to larger buffers. Don't write terminating null
1522 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001523
1524void*
1525_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1526{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001527 Py_ssize_t len;
1528 void *result;
1529 unsigned int skind;
1530
1531 if (PyUnicode_READY(s))
1532 return NULL;
1533
1534 len = PyUnicode_GET_LENGTH(s);
1535 skind = PyUnicode_KIND(s);
1536 if (skind >= kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001537 PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1538 return NULL;
1539 }
1540 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001541 case PyUnicode_2BYTE_KIND:
1542 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1543 if (!result)
1544 return PyErr_NoMemory();
1545 assert(skind == PyUnicode_1BYTE_KIND);
1546 _PyUnicode_CONVERT_BYTES(
1547 Py_UCS1, Py_UCS2,
1548 PyUnicode_1BYTE_DATA(s),
1549 PyUnicode_1BYTE_DATA(s) + len,
1550 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001551 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001552 case PyUnicode_4BYTE_KIND:
1553 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1554 if (!result)
1555 return PyErr_NoMemory();
1556 if (skind == PyUnicode_2BYTE_KIND) {
1557 _PyUnicode_CONVERT_BYTES(
1558 Py_UCS2, Py_UCS4,
1559 PyUnicode_2BYTE_DATA(s),
1560 PyUnicode_2BYTE_DATA(s) + len,
1561 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001562 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001563 else {
1564 assert(skind == PyUnicode_1BYTE_KIND);
1565 _PyUnicode_CONVERT_BYTES(
1566 Py_UCS1, Py_UCS4,
1567 PyUnicode_1BYTE_DATA(s),
1568 PyUnicode_1BYTE_DATA(s) + len,
1569 result);
1570 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001571 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001572 default:
1573 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001574 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001575 PyErr_SetString(PyExc_ValueError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001576 return NULL;
1577}
1578
1579static Py_UCS4*
1580as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1581 int copy_null)
1582{
1583 int kind;
1584 void *data;
1585 Py_ssize_t len, targetlen;
1586 if (PyUnicode_READY(string) == -1)
1587 return NULL;
1588 kind = PyUnicode_KIND(string);
1589 data = PyUnicode_DATA(string);
1590 len = PyUnicode_GET_LENGTH(string);
1591 targetlen = len;
1592 if (copy_null)
1593 targetlen++;
1594 if (!target) {
1595 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1596 PyErr_NoMemory();
1597 return NULL;
1598 }
1599 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1600 if (!target) {
1601 PyErr_NoMemory();
1602 return NULL;
1603 }
1604 }
1605 else {
1606 if (targetsize < targetlen) {
1607 PyErr_Format(PyExc_SystemError,
1608 "string is longer than the buffer");
1609 if (copy_null && 0 < targetsize)
1610 target[0] = 0;
1611 return NULL;
1612 }
1613 }
1614 if (kind != PyUnicode_4BYTE_KIND) {
1615 Py_ssize_t i;
1616 for (i = 0; i < len; i++)
1617 target[i] = PyUnicode_READ(kind, data, i);
1618 }
1619 else
1620 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1621 if (copy_null)
1622 target[len] = 0;
1623 return target;
1624}
1625
1626Py_UCS4*
1627PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1628 int copy_null)
1629{
1630 if (target == NULL || targetsize < 1) {
1631 PyErr_BadInternalCall();
1632 return NULL;
1633 }
1634 return as_ucs4(string, target, targetsize, copy_null);
1635}
1636
1637Py_UCS4*
1638PyUnicode_AsUCS4Copy(PyObject *string)
1639{
1640 return as_ucs4(string, NULL, 0, 1);
1641}
1642
1643#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001644
Alexander Belopolsky40018472011-02-26 01:02:56 +00001645PyObject *
1646PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001648 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001649 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001650 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001651 PyErr_BadInternalCall();
1652 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001653 }
1654
Martin v. Löwis790465f2008-04-05 20:41:37 +00001655 if (size == -1) {
1656 size = wcslen(w);
1657 }
1658
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001659 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001660}
1661
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001662#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001663
Walter Dörwald346737f2007-05-31 10:44:43 +00001664static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001665makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1666 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001667{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001668 *fmt++ = '%';
1669 if (width) {
1670 if (zeropad)
1671 *fmt++ = '0';
1672 fmt += sprintf(fmt, "%d", width);
1673 }
1674 if (precision)
1675 fmt += sprintf(fmt, ".%d", precision);
1676 if (longflag)
1677 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001678 else if (longlongflag) {
1679 /* longlongflag should only ever be nonzero on machines with
1680 HAVE_LONG_LONG defined */
1681#ifdef HAVE_LONG_LONG
1682 char *f = PY_FORMAT_LONG_LONG;
1683 while (*f)
1684 *fmt++ = *f++;
1685#else
1686 /* we shouldn't ever get here */
1687 assert(0);
1688 *fmt++ = 'l';
1689#endif
1690 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001691 else if (size_tflag) {
1692 char *f = PY_FORMAT_SIZE_T;
1693 while (*f)
1694 *fmt++ = *f++;
1695 }
1696 *fmt++ = c;
1697 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001698}
1699
/* helper for PyUnicode_FromFormatV()

   `f` points at the '%' of a format specification.  Parses the optional
   width, ".precision" and length modifier ("l", "ll" when HAVE_LONG_LONG is
   defined, or "z"; each only when followed by d, u or i) and stores them
   through the non-NULL output pointers.  Returns a pointer to the conversion
   character (or to the last consumed character for the bogus formats noted
   below). */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        f += 1;
    }

    /* hand back whatever the caller asked for */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1764
/* Upper bound on the characters produced by %ld: 21 characters are enough
   for a 64-bit integer in decimal plus an optional sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the characters produced by %lld.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1772
Walter Dörwaldd2034312007-05-18 16:29:38 +00001773PyObject *
1774PyUnicode_FromFormatV(const char *format, va_list vargs)
1775{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001776 va_list count;
1777 Py_ssize_t callcount = 0;
1778 PyObject **callresults = NULL;
1779 PyObject **callresult = NULL;
1780 Py_ssize_t n = 0;
1781 int width = 0;
1782 int precision = 0;
1783 int zeropad;
1784 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001786 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001787 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1789 Py_UCS4 argmaxchar;
1790 Py_ssize_t numbersize = 0;
1791 char *numberresults = NULL;
1792 char *numberresult = NULL;
1793 Py_ssize_t i;
1794 int kind;
1795 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001796
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001797 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001798 /* step 1: count the number of %S/%R/%A/%s format specifications
1799 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1800 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801 * result in an array)
1802 * also esimate a upper bound for all the number formats in the string,
1803 * numbers will be formated in step 3 and be keept in a '\0'-separated
1804 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001805 for (f = format; *f; f++) {
1806 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001807 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1809 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1810 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1811 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001814#ifdef HAVE_LONG_LONG
1815 if (longlongflag) {
1816 if (width < MAX_LONG_LONG_CHARS)
1817 width = MAX_LONG_LONG_CHARS;
1818 }
1819 else
1820#endif
1821 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1822 including sign. Decimal takes the most space. This
1823 isn't enough for octal. If a width is specified we
1824 need more (which we allocate later). */
1825 if (width < MAX_LONG_CHARS)
1826 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827
1828 /* account for the size + '\0' to separate numbers
1829 inside of the numberresults buffer */
1830 numbersize += (width + 1);
1831 }
1832 }
1833 else if ((unsigned char)*f > 127) {
1834 PyErr_Format(PyExc_ValueError,
1835 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1836 "string, got a non-ASCII byte: 0x%02x",
1837 (unsigned char)*f);
1838 return NULL;
1839 }
1840 }
1841 /* step 2: allocate memory for the results of
1842 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1843 if (callcount) {
1844 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1845 if (!callresults) {
1846 PyErr_NoMemory();
1847 return NULL;
1848 }
1849 callresult = callresults;
1850 }
1851 /* step 2.5: allocate memory for the results of formating numbers */
1852 if (numbersize) {
1853 numberresults = PyObject_Malloc(numbersize);
1854 if (!numberresults) {
1855 PyErr_NoMemory();
1856 goto fail;
1857 }
1858 numberresult = numberresults;
1859 }
1860
1861 /* step 3: format numbers and figure out how large a buffer we need */
1862 for (f = format; *f; f++) {
1863 if (*f == '%') {
1864 const char* p;
1865 int longflag;
1866 int longlongflag;
1867 int size_tflag;
1868 int numprinted;
1869
1870 p = f;
1871 zeropad = (f[1] == '0');
1872 f = parse_format_flags(f, &width, &precision,
1873 &longflag, &longlongflag, &size_tflag);
1874 switch (*f) {
1875 case 'c':
1876 {
1877 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001878 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001879 n++;
1880 break;
1881 }
1882 case '%':
1883 n++;
1884 break;
1885 case 'i':
1886 case 'd':
1887 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1888 width, precision, *f);
1889 if (longflag)
1890 numprinted = sprintf(numberresult, fmt,
1891 va_arg(count, long));
1892#ifdef HAVE_LONG_LONG
1893 else if (longlongflag)
1894 numprinted = sprintf(numberresult, fmt,
1895 va_arg(count, PY_LONG_LONG));
1896#endif
1897 else if (size_tflag)
1898 numprinted = sprintf(numberresult, fmt,
1899 va_arg(count, Py_ssize_t));
1900 else
1901 numprinted = sprintf(numberresult, fmt,
1902 va_arg(count, int));
1903 n += numprinted;
1904 /* advance by +1 to skip over the '\0' */
1905 numberresult += (numprinted + 1);
1906 assert(*(numberresult - 1) == '\0');
1907 assert(*(numberresult - 2) != '\0');
1908 assert(numprinted >= 0);
1909 assert(numberresult <= numberresults + numbersize);
1910 break;
1911 case 'u':
1912 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1913 width, precision, 'u');
1914 if (longflag)
1915 numprinted = sprintf(numberresult, fmt,
1916 va_arg(count, unsigned long));
1917#ifdef HAVE_LONG_LONG
1918 else if (longlongflag)
1919 numprinted = sprintf(numberresult, fmt,
1920 va_arg(count, unsigned PY_LONG_LONG));
1921#endif
1922 else if (size_tflag)
1923 numprinted = sprintf(numberresult, fmt,
1924 va_arg(count, size_t));
1925 else
1926 numprinted = sprintf(numberresult, fmt,
1927 va_arg(count, unsigned int));
1928 n += numprinted;
1929 numberresult += (numprinted + 1);
1930 assert(*(numberresult - 1) == '\0');
1931 assert(*(numberresult - 2) != '\0');
1932 assert(numprinted >= 0);
1933 assert(numberresult <= numberresults + numbersize);
1934 break;
1935 case 'x':
1936 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1937 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
1938 n += numprinted;
1939 numberresult += (numprinted + 1);
1940 assert(*(numberresult - 1) == '\0');
1941 assert(*(numberresult - 2) != '\0');
1942 assert(numprinted >= 0);
1943 assert(numberresult <= numberresults + numbersize);
1944 break;
1945 case 'p':
1946 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
1947 /* %p is ill-defined: ensure leading 0x. */
1948 if (numberresult[1] == 'X')
1949 numberresult[1] = 'x';
1950 else if (numberresult[1] != 'x') {
1951 memmove(numberresult + 2, numberresult,
1952 strlen(numberresult) + 1);
1953 numberresult[0] = '0';
1954 numberresult[1] = 'x';
1955 numprinted += 2;
1956 }
1957 n += numprinted;
1958 numberresult += (numprinted + 1);
1959 assert(*(numberresult - 1) == '\0');
1960 assert(*(numberresult - 2) != '\0');
1961 assert(numprinted >= 0);
1962 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001963 break;
1964 case 's':
1965 {
1966 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00001967 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001968 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
1969 if (!str)
1970 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001971 /* since PyUnicode_DecodeUTF8 returns already flexible
1972 unicode objects, there is no need to call ready on them */
1973 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001974 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001975 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001976 /* Remember the str and switch to the next slot */
1977 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001978 break;
1979 }
1980 case 'U':
1981 {
1982 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02001983 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001984 if (PyUnicode_READY(obj) == -1)
1985 goto fail;
1986 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02001987 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001989 break;
1990 }
1991 case 'V':
1992 {
1993 PyObject *obj = va_arg(count, PyObject *);
1994 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001995 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001996 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02001997 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00001998 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 if (PyUnicode_READY(obj) == -1)
2000 goto fail;
2001 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002002 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002003 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002004 *callresult++ = NULL;
2005 }
2006 else {
2007 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2008 if (!str_obj)
2009 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002010 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002011 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002013 *callresult++ = str_obj;
2014 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002015 break;
2016 }
2017 case 'S':
2018 {
2019 PyObject *obj = va_arg(count, PyObject *);
2020 PyObject *str;
2021 assert(obj);
2022 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002023 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002024 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002025 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002026 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002027 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002028 /* Remember the str and switch to the next slot */
2029 *callresult++ = str;
2030 break;
2031 }
2032 case 'R':
2033 {
2034 PyObject *obj = va_arg(count, PyObject *);
2035 PyObject *repr;
2036 assert(obj);
2037 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002038 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002039 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002040 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002041 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002043 /* Remember the repr and switch to the next slot */
2044 *callresult++ = repr;
2045 break;
2046 }
2047 case 'A':
2048 {
2049 PyObject *obj = va_arg(count, PyObject *);
2050 PyObject *ascii;
2051 assert(obj);
2052 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002053 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002054 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002055 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002056 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002057 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002058 /* Remember the repr and switch to the next slot */
2059 *callresult++ = ascii;
2060 break;
2061 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002062 default:
2063 /* if we stumble upon an unknown
2064 formatting code, copy the rest of
2065 the format string to the output
2066 string. (we cannot just skip the
2067 code, since there's no way to know
2068 what's in the argument list) */
2069 n += strlen(p);
2070 goto expand;
2071 }
2072 } else
2073 n++;
2074 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002075 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002076 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002077 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002078 we don't have to resize the string.
2079 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002080 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002081 if (!string)
2082 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002083 kind = PyUnicode_KIND(string);
2084 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002085 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002086 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002088 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002089 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002090 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002091
2092 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002093 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2094 /* checking for == because the last argument could be a empty
2095 string, which causes i to point to end, the assert at the end of
2096 the loop */
2097 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002098
Benjamin Peterson14339b62009-01-31 16:36:08 +00002099 switch (*f) {
2100 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002101 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002102 const int ordinal = va_arg(vargs, int);
2103 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002104 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002105 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002106 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002107 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002108 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002109 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002110 case 'p':
2111 /* unused, since we already have the result */
2112 if (*f == 'p')
2113 (void) va_arg(vargs, void *);
2114 else
2115 (void) va_arg(vargs, int);
2116 /* extract the result from numberresults and append. */
2117 for (; *numberresult; ++i, ++numberresult)
2118 PyUnicode_WRITE(kind, data, i, *numberresult);
2119 /* skip over the separating '\0' */
2120 assert(*numberresult == '\0');
2121 numberresult++;
2122 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002123 break;
2124 case 's':
2125 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002126 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002127 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002128 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129 size = PyUnicode_GET_LENGTH(*callresult);
2130 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002131 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2132 *callresult, 0,
2133 size) < 0)
2134 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002135 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002136 /* We're done with the unicode()/repr() => forget it */
2137 Py_DECREF(*callresult);
2138 /* switch to next unicode()/repr() result */
2139 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002140 break;
2141 }
2142 case 'U':
2143 {
2144 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002145 Py_ssize_t size;
2146 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2147 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002148 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2149 obj, 0,
2150 size) < 0)
2151 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002152 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002153 break;
2154 }
2155 case 'V':
2156 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002158 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002159 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002160 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002161 size = PyUnicode_GET_LENGTH(obj);
2162 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002163 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2164 obj, 0,
2165 size) < 0)
2166 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002167 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002168 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 size = PyUnicode_GET_LENGTH(*callresult);
2170 assert(PyUnicode_KIND(*callresult) <=
2171 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002172 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2173 *callresult,
2174 0, size) < 0)
2175 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002176 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002177 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002178 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002179 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002180 break;
2181 }
2182 case 'S':
2183 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002184 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002185 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002186 /* unused, since we already have the result */
2187 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002188 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002189 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2190 *callresult, 0,
2191 PyUnicode_GET_LENGTH(*callresult)) < 0)
2192 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002194 /* We're done with the unicode()/repr() => forget it */
2195 Py_DECREF(*callresult);
2196 /* switch to next unicode()/repr() result */
2197 ++callresult;
2198 break;
2199 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002200 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002202 break;
2203 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002204 for (; *p; ++p, ++i)
2205 PyUnicode_WRITE(kind, data, i, *p);
2206 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002207 goto end;
2208 }
Victor Stinner1205f272010-09-11 00:54:47 +00002209 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002210 else {
2211 assert(i < PyUnicode_GET_LENGTH(string));
2212 PyUnicode_WRITE(kind, data, i++, *f);
2213 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002214 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002216
Benjamin Peterson29060642009-01-31 22:14:21 +00002217 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002218 if (callresults)
2219 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 if (numberresults)
2221 PyObject_Free(numberresults);
2222 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002223 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002224 if (callresults) {
2225 PyObject **callresult2 = callresults;
2226 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002227 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002228 ++callresult2;
2229 }
2230 PyObject_Free(callresults);
2231 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232 if (numberresults)
2233 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002234 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002235}
2236
Walter Dörwaldd2034312007-05-18 16:29:38 +00002237PyObject *
2238PyUnicode_FromFormat(const char *format, ...)
2239{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002240 PyObject* ret;
2241 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002242
2243#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002244 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002245#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002246 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002247#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002248 ret = PyUnicode_FromFormatV(format, vargs);
2249 va_end(vargs);
2250 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002251}
2252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002253#ifdef HAVE_WCHAR_H
2254
Victor Stinner5593d8a2010-10-02 11:11:27 +00002255/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2256 convert a Unicode object to a wide character string.
2257
Victor Stinnerd88d9832011-09-06 02:00:05 +02002258 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002259 character) required to convert the unicode object. Ignore size argument.
2260
Victor Stinnerd88d9832011-09-06 02:00:05 +02002261 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002262 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002263 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002264static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002265unicode_aswidechar(PyUnicodeObject *unicode,
2266 wchar_t *w,
2267 Py_ssize_t size)
2268{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002269 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002270 const wchar_t *wstr;
2271
2272 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2273 if (wstr == NULL)
2274 return -1;
2275
Victor Stinner5593d8a2010-10-02 11:11:27 +00002276 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002277 if (size > res)
2278 size = res + 1;
2279 else
2280 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002282 return res;
2283 }
2284 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002285 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002286}
2287
2288Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002289PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002290 wchar_t *w,
2291 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002292{
2293 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002294 PyErr_BadInternalCall();
2295 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002296 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002297 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298}
2299
Victor Stinner137c34c2010-09-29 10:25:54 +00002300wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002301PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002302 Py_ssize_t *size)
2303{
2304 wchar_t* buffer;
2305 Py_ssize_t buflen;
2306
2307 if (unicode == NULL) {
2308 PyErr_BadInternalCall();
2309 return NULL;
2310 }
2311
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002312 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002313 if (buflen == -1)
2314 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002315 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002316 PyErr_NoMemory();
2317 return NULL;
2318 }
2319
Victor Stinner137c34c2010-09-29 10:25:54 +00002320 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2321 if (buffer == NULL) {
2322 PyErr_NoMemory();
2323 return NULL;
2324 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002325 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002326 if (buflen == -1)
2327 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002328 if (size != NULL)
2329 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002330 return buffer;
2331}
2332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002333#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002334
Alexander Belopolsky40018472011-02-26 01:02:56 +00002335PyObject *
2336PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002337{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002338 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002339 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002340 PyErr_SetString(PyExc_ValueError,
2341 "chr() arg not in range(0x110000)");
2342 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002343 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002345 if (ordinal < 256)
2346 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002347
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002348 v = PyUnicode_New(1, ordinal);
2349 if (v == NULL)
2350 return NULL;
2351 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2352 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002353}
2354
Alexander Belopolsky40018472011-02-26 01:02:56 +00002355PyObject *
2356PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002357{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002358 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002359 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002360 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002361 if (PyUnicode_READY(obj))
2362 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002363 Py_INCREF(obj);
2364 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002365 }
2366 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002367 /* For a Unicode subtype that's not a Unicode object,
2368 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002369 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002370 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002371 PyErr_Format(PyExc_TypeError,
2372 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002373 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002374 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002375}
2376
Alexander Belopolsky40018472011-02-26 01:02:56 +00002377PyObject *
2378PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002379 const char *encoding,
2380 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002381{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002382 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002383 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002384
Guido van Rossumd57fd912000-03-10 22:53:23 +00002385 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002386 PyErr_BadInternalCall();
2387 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002388 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002389
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002390 /* Decoding bytes objects is the most common case and should be fast */
2391 if (PyBytes_Check(obj)) {
2392 if (PyBytes_GET_SIZE(obj) == 0) {
2393 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002394 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002395 }
2396 else {
2397 v = PyUnicode_Decode(
2398 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2399 encoding, errors);
2400 }
2401 return v;
2402 }
2403
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002404 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002405 PyErr_SetString(PyExc_TypeError,
2406 "decoding str is not supported");
2407 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002408 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002409
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002410 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2411 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2412 PyErr_Format(PyExc_TypeError,
2413 "coercing to str: need bytes, bytearray "
2414 "or buffer-like object, %.80s found",
2415 Py_TYPE(obj)->tp_name);
2416 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002417 }
Tim Petersced69f82003-09-16 20:30:58 +00002418
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002419 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002420 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002421 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002422 }
Tim Petersced69f82003-09-16 20:30:58 +00002423 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002424 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002425
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002426 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002427 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002428}
2429
/* Copy encoding into lower, converting upper case letters to lower case and
   replacing '_' with '-', so that spellings such as UTF_8 are recognized.

   lower_len is the capacity of lower in bytes, including room for the
   terminating null byte.  Return 1 on success.  Return 0 if encoding is
   longer than lower_len-1 characters; in that case lower is left without a
   terminating null byte and must not be used. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last usable slot, reserved for the null byte */
    char *const limit = &lower[lower_len - 1];

    for (; *src != '\0'; src++, dst++) {
        if (dst == limit)
            return 0;
        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
2462
Alexander Belopolsky40018472011-02-26 01:02:56 +00002463PyObject *
2464PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002465 Py_ssize_t size,
2466 const char *encoding,
2467 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002468{
2469 PyObject *buffer = NULL, *unicode;
2470 Py_buffer info;
2471 char lower[11]; /* Enough for any encoding shortcut */
2472
2473 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002474 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002475
2476 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002477 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002478 if ((strcmp(lower, "utf-8") == 0) ||
2479 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002480 return PyUnicode_DecodeUTF8(s, size, errors);
2481 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002482 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002483 (strcmp(lower, "iso-8859-1") == 0))
2484 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002485#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002486 else if (strcmp(lower, "mbcs") == 0)
2487 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002488#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002489 else if (strcmp(lower, "ascii") == 0)
2490 return PyUnicode_DecodeASCII(s, size, errors);
2491 else if (strcmp(lower, "utf-16") == 0)
2492 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2493 else if (strcmp(lower, "utf-32") == 0)
2494 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496
2497 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002498 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002499 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002500 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002501 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502 if (buffer == NULL)
2503 goto onError;
2504 unicode = PyCodec_Decode(buffer, encoding, errors);
2505 if (unicode == NULL)
2506 goto onError;
2507 if (!PyUnicode_Check(unicode)) {
2508 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002509 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002510 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511 Py_DECREF(unicode);
2512 goto onError;
2513 }
2514 Py_DECREF(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002515 if (PyUnicode_READY(unicode)) {
2516 Py_DECREF(unicode);
2517 return NULL;
2518 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002520
Benjamin Peterson29060642009-01-31 22:14:21 +00002521 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002522 Py_XDECREF(buffer);
2523 return NULL;
2524}
2525
Alexander Belopolsky40018472011-02-26 01:02:56 +00002526PyObject *
2527PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002528 const char *encoding,
2529 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002530{
2531 PyObject *v;
2532
2533 if (!PyUnicode_Check(unicode)) {
2534 PyErr_BadArgument();
2535 goto onError;
2536 }
2537
2538 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002539 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002540
2541 /* Decode via the codec registry */
2542 v = PyCodec_Decode(unicode, encoding, errors);
2543 if (v == NULL)
2544 goto onError;
2545 return v;
2546
Benjamin Peterson29060642009-01-31 22:14:21 +00002547 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002548 return NULL;
2549}
2550
Alexander Belopolsky40018472011-02-26 01:02:56 +00002551PyObject *
2552PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002553 const char *encoding,
2554 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002555{
2556 PyObject *v;
2557
2558 if (!PyUnicode_Check(unicode)) {
2559 PyErr_BadArgument();
2560 goto onError;
2561 }
2562
2563 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002564 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002565
2566 /* Decode via the codec registry */
2567 v = PyCodec_Decode(unicode, encoding, errors);
2568 if (v == NULL)
2569 goto onError;
2570 if (!PyUnicode_Check(v)) {
2571 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002572 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002573 Py_TYPE(v)->tp_name);
2574 Py_DECREF(v);
2575 goto onError;
2576 }
2577 return v;
2578
Benjamin Peterson29060642009-01-31 22:14:21 +00002579 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002580 return NULL;
2581}
2582
Alexander Belopolsky40018472011-02-26 01:02:56 +00002583PyObject *
2584PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002585 Py_ssize_t size,
2586 const char *encoding,
2587 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002588{
2589 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002590
Guido van Rossumd57fd912000-03-10 22:53:23 +00002591 unicode = PyUnicode_FromUnicode(s, size);
2592 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002593 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2595 Py_DECREF(unicode);
2596 return v;
2597}
2598
Alexander Belopolsky40018472011-02-26 01:02:56 +00002599PyObject *
2600PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002601 const char *encoding,
2602 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002603{
2604 PyObject *v;
2605
2606 if (!PyUnicode_Check(unicode)) {
2607 PyErr_BadArgument();
2608 goto onError;
2609 }
2610
2611 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002612 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002613
2614 /* Encode via the codec registry */
2615 v = PyCodec_Encode(unicode, encoding, errors);
2616 if (v == NULL)
2617 goto onError;
2618 return v;
2619
Benjamin Peterson29060642009-01-31 22:14:21 +00002620 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002621 return NULL;
2622}
2623
Victor Stinnerad158722010-10-27 00:25:46 +00002624PyObject *
2625PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002626{
Victor Stinner99b95382011-07-04 14:23:54 +02002627#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002628 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2629 PyUnicode_GET_SIZE(unicode),
2630 NULL);
2631#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002632 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002633#else
Victor Stinner793b5312011-04-27 00:24:21 +02002634 PyInterpreterState *interp = PyThreadState_GET()->interp;
2635 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2636 cannot use it to encode and decode filenames before it is loaded. Load
2637 the Python codec requires to encode at least its own filename. Use the C
2638 version of the locale codec until the codec registry is initialized and
2639 the Python codec is loaded.
2640
2641 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2642 cannot only rely on it: check also interp->fscodec_initialized for
2643 subinterpreters. */
2644 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002645 return PyUnicode_AsEncodedString(unicode,
2646 Py_FileSystemDefaultEncoding,
2647 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002648 }
2649 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002650 /* locale encoding with surrogateescape */
2651 wchar_t *wchar;
2652 char *bytes;
2653 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002654 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002655
2656 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2657 if (wchar == NULL)
2658 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002659 bytes = _Py_wchar2char(wchar, &error_pos);
2660 if (bytes == NULL) {
2661 if (error_pos != (size_t)-1) {
2662 char *errmsg = strerror(errno);
2663 PyObject *exc = NULL;
2664 if (errmsg == NULL)
2665 errmsg = "Py_wchar2char() failed";
2666 raise_encode_exception(&exc,
2667 "filesystemencoding",
2668 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2669 error_pos, error_pos+1,
2670 errmsg);
2671 Py_XDECREF(exc);
2672 }
2673 else
2674 PyErr_NoMemory();
2675 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002676 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002677 }
2678 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002679
2680 bytes_obj = PyBytes_FromString(bytes);
2681 PyMem_Free(bytes);
2682 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002683 }
Victor Stinnerad158722010-10-27 00:25:46 +00002684#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002685}
2686
Alexander Belopolsky40018472011-02-26 01:02:56 +00002687PyObject *
2688PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002689 const char *encoding,
2690 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002691{
2692 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002693 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002694
Guido van Rossumd57fd912000-03-10 22:53:23 +00002695 if (!PyUnicode_Check(unicode)) {
2696 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002697 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698 }
Fred Drakee4315f52000-05-09 19:53:39 +00002699
Victor Stinner2f283c22011-03-02 01:21:46 +00002700 if (encoding == NULL) {
2701 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002702 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002703 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002704 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002705 }
Fred Drakee4315f52000-05-09 19:53:39 +00002706
2707 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002708 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002709 if ((strcmp(lower, "utf-8") == 0) ||
2710 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002711 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002712 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002713 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002714 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002715 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002716 }
Victor Stinner37296e82010-06-10 13:36:23 +00002717 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002718 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002719 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002720 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002721#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002722 else if (strcmp(lower, "mbcs") == 0)
2723 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2724 PyUnicode_GET_SIZE(unicode),
2725 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002726#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002727 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002728 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002729 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730
2731 /* Encode via the codec registry */
2732 v = PyCodec_Encode(unicode, encoding, errors);
2733 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002734 return NULL;
2735
2736 /* The normal path */
2737 if (PyBytes_Check(v))
2738 return v;
2739
2740 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002741 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002742 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002743 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002744
2745 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2746 "encoder %s returned bytearray instead of bytes",
2747 encoding);
2748 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002749 Py_DECREF(v);
2750 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002751 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002752
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002753 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2754 Py_DECREF(v);
2755 return b;
2756 }
2757
2758 PyErr_Format(PyExc_TypeError,
2759 "encoder did not return a bytes object (type=%.400s)",
2760 Py_TYPE(v)->tp_name);
2761 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002762 return NULL;
2763}
2764
Alexander Belopolsky40018472011-02-26 01:02:56 +00002765PyObject *
2766PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002767 const char *encoding,
2768 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002769{
2770 PyObject *v;
2771
2772 if (!PyUnicode_Check(unicode)) {
2773 PyErr_BadArgument();
2774 goto onError;
2775 }
2776
2777 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002778 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002779
2780 /* Encode via the codec registry */
2781 v = PyCodec_Encode(unicode, encoding, errors);
2782 if (v == NULL)
2783 goto onError;
2784 if (!PyUnicode_Check(v)) {
2785 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002786 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002787 Py_TYPE(v)->tp_name);
2788 Py_DECREF(v);
2789 goto onError;
2790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002792
Benjamin Peterson29060642009-01-31 22:14:21 +00002793 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794 return NULL;
2795}
2796
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002797PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002798PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002799 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00002800 return PyUnicode_DecodeFSDefaultAndSize(s, size);
2801}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002802
Christian Heimes5894ba72007-11-04 11:43:14 +00002803PyObject*
2804PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2805{
Victor Stinner99b95382011-07-04 14:23:54 +02002806#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002807 return PyUnicode_DecodeMBCS(s, size, NULL);
2808#elif defined(__APPLE__)
2809 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2810#else
Victor Stinner793b5312011-04-27 00:24:21 +02002811 PyInterpreterState *interp = PyThreadState_GET()->interp;
2812 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2813 cannot use it to encode and decode filenames before it is loaded. Load
2814 the Python codec requires to encode at least its own filename. Use the C
2815 version of the locale codec until the codec registry is initialized and
2816 the Python codec is loaded.
2817
2818 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2819 cannot only rely on it: check also interp->fscodec_initialized for
2820 subinterpreters. */
2821 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002822 return PyUnicode_Decode(s, size,
2823 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00002824 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002825 }
2826 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002827 /* locale encoding with surrogateescape */
2828 wchar_t *wchar;
2829 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00002830 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002831
2832 if (s[size] != '\0' || size != strlen(s)) {
2833 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2834 return NULL;
2835 }
2836
Victor Stinner168e1172010-10-16 23:16:16 +00002837 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002838 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00002839 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002840
Victor Stinner168e1172010-10-16 23:16:16 +00002841 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002842 PyMem_Free(wchar);
2843 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002844 }
Victor Stinnerad158722010-10-27 00:25:46 +00002845#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002846}
2847
Martin v. Löwis011e8422009-05-05 04:43:17 +00002848
2849int
2850PyUnicode_FSConverter(PyObject* arg, void* addr)
2851{
2852 PyObject *output = NULL;
2853 Py_ssize_t size;
2854 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002855 if (arg == NULL) {
2856 Py_DECREF(*(PyObject**)addr);
2857 return 1;
2858 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00002859 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00002860 output = arg;
2861 Py_INCREF(output);
2862 }
2863 else {
2864 arg = PyUnicode_FromObject(arg);
2865 if (!arg)
2866 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00002867 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002868 Py_DECREF(arg);
2869 if (!output)
2870 return 0;
2871 if (!PyBytes_Check(output)) {
2872 Py_DECREF(output);
2873 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2874 return 0;
2875 }
2876 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00002877 size = PyBytes_GET_SIZE(output);
2878 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00002879 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05002880 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00002881 Py_DECREF(output);
2882 return 0;
2883 }
2884 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00002885 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00002886}
2887
2888
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002889int
2890PyUnicode_FSDecoder(PyObject* arg, void* addr)
2891{
2892 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002893 if (arg == NULL) {
2894 Py_DECREF(*(PyObject**)addr);
2895 return 1;
2896 }
2897 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002898 if (PyUnicode_READY(arg))
2899 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002900 output = arg;
2901 Py_INCREF(output);
2902 }
2903 else {
2904 arg = PyBytes_FromObject(arg);
2905 if (!arg)
2906 return 0;
2907 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2908 PyBytes_GET_SIZE(arg));
2909 Py_DECREF(arg);
2910 if (!output)
2911 return 0;
2912 if (!PyUnicode_Check(output)) {
2913 Py_DECREF(output);
2914 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
2915 return 0;
2916 }
2917 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002918 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
2919 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00002920 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2921 Py_DECREF(output);
2922 return 0;
2923 }
2924 *(PyObject**)addr = output;
2925 return Py_CLEANUP_SUPPORTED;
2926}
2927
2928
Martin v. Löwis5b222132007-06-10 09:51:05 +00002929char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002930PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002931{
Christian Heimesf3863112007-11-22 07:46:41 +00002932 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002933 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
2934
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00002935 if (!PyUnicode_Check(unicode)) {
2936 PyErr_BadArgument();
2937 return NULL;
2938 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002939 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00002940 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002941
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002942 if (PyUnicode_UTF8(unicode) == NULL) {
2943 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002944 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
2945 if (bytes == NULL)
2946 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002947 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
2948 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002949 Py_DECREF(bytes);
2950 return NULL;
2951 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002952 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
2953 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002954 Py_DECREF(bytes);
2955 }
2956
2957 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02002958 *psize = PyUnicode_UTF8_LENGTH(unicode);
2959 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002960}
2961
2962char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002963PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00002964{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002965 return PyUnicode_AsUTF8AndSize(unicode, NULL);
2966}
2967
#if defined(Py_DEBUG)
/* Debug-build counter, incremented by PyUnicode_AsUnicodeAndSize() each time
   it has to build the wchar_t representation of an object that does not
   have one yet. */
int unicode_as_unicode_calls = 0;
#endif
2971
2972
2973Py_UNICODE *
2974PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
2975{
2976 PyUnicodeObject *u;
2977 const unsigned char *one_byte;
2978#if SIZEOF_WCHAR_T == 4
2979 const Py_UCS2 *two_bytes;
2980#else
2981 const Py_UCS4 *four_bytes;
2982 const Py_UCS4 *ucs4_end;
2983 Py_ssize_t num_surrogates;
2984#endif
2985 wchar_t *w;
2986 wchar_t *wchar_end;
2987
2988 if (!PyUnicode_Check(unicode)) {
2989 PyErr_BadArgument();
2990 return NULL;
2991 }
2992 u = (PyUnicodeObject*)unicode;
2993 if (_PyUnicode_WSTR(u) == NULL) {
2994 /* Non-ASCII compact unicode object */
2995 assert(_PyUnicode_KIND(u) != 0);
2996 assert(PyUnicode_IS_READY(u));
2997
2998#ifdef Py_DEBUG
2999 ++unicode_as_unicode_calls;
3000#endif
3001
3002 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3003#if SIZEOF_WCHAR_T == 2
3004 four_bytes = PyUnicode_4BYTE_DATA(u);
3005 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3006 num_surrogates = 0;
3007
3008 for (; four_bytes < ucs4_end; ++four_bytes) {
3009 if (*four_bytes > 0xFFFF)
3010 ++num_surrogates;
3011 }
3012
3013 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3014 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3015 if (!_PyUnicode_WSTR(u)) {
3016 PyErr_NoMemory();
3017 return NULL;
3018 }
3019 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3020
3021 w = _PyUnicode_WSTR(u);
3022 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3023 four_bytes = PyUnicode_4BYTE_DATA(u);
3024 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3025 if (*four_bytes > 0xFFFF) {
3026 /* encode surrogate pair in this case */
3027 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3028 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3029 }
3030 else
3031 *w = *four_bytes;
3032
3033 if (w > wchar_end) {
3034 assert(0 && "Miscalculated string end");
3035 }
3036 }
3037 *w = 0;
3038#else
3039 /* sizeof(wchar_t) == 4 */
3040 Py_FatalError("Impossible unicode object state, wstr and str "
3041 "should share memory already.");
3042 return NULL;
3043#endif
3044 }
3045 else {
3046 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3047 (_PyUnicode_LENGTH(u) + 1));
3048 if (!_PyUnicode_WSTR(u)) {
3049 PyErr_NoMemory();
3050 return NULL;
3051 }
3052 if (!PyUnicode_IS_COMPACT_ASCII(u))
3053 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3054 w = _PyUnicode_WSTR(u);
3055 wchar_end = w + _PyUnicode_LENGTH(u);
3056
3057 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3058 one_byte = PyUnicode_1BYTE_DATA(u);
3059 for (; w < wchar_end; ++one_byte, ++w)
3060 *w = *one_byte;
3061 /* null-terminate the wstr */
3062 *w = 0;
3063 }
3064 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3065#if SIZEOF_WCHAR_T == 4
3066 two_bytes = PyUnicode_2BYTE_DATA(u);
3067 for (; w < wchar_end; ++two_bytes, ++w)
3068 *w = *two_bytes;
3069 /* null-terminate the wstr */
3070 *w = 0;
3071#else
3072 /* sizeof(wchar_t) == 2 */
3073 PyObject_FREE(_PyUnicode_WSTR(u));
3074 _PyUnicode_WSTR(u) = NULL;
3075 Py_FatalError("Impossible unicode object state, wstr "
3076 "and str should share memory already.");
3077 return NULL;
3078#endif
3079 }
3080 else {
3081 assert(0 && "This should never happen.");
3082 }
3083 }
3084 }
3085 if (size != NULL)
3086 *size = PyUnicode_WSTR_LENGTH(u);
3087 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003088}
3089
Alexander Belopolsky40018472011-02-26 01:02:56 +00003090Py_UNICODE *
3091PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003093 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003094}
3095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003096
Alexander Belopolsky40018472011-02-26 01:02:56 +00003097Py_ssize_t
3098PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003099{
3100 if (!PyUnicode_Check(unicode)) {
3101 PyErr_BadArgument();
3102 goto onError;
3103 }
3104 return PyUnicode_GET_SIZE(unicode);
3105
Benjamin Peterson29060642009-01-31 22:14:21 +00003106 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107 return -1;
3108}
3109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003110Py_ssize_t
3111PyUnicode_GetLength(PyObject *unicode)
3112{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003113 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003114 PyErr_BadArgument();
3115 return -1;
3116 }
3117
3118 return PyUnicode_GET_LENGTH(unicode);
3119}
3120
3121Py_UCS4
3122PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3123{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003124 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3125 PyErr_BadArgument();
3126 return (Py_UCS4)-1;
3127 }
3128 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3129 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003130 return (Py_UCS4)-1;
3131 }
3132 return PyUnicode_READ_CHAR(unicode, index);
3133}
3134
3135int
3136PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3137{
3138 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003139 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003140 return -1;
3141 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003142 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3143 PyErr_SetString(PyExc_IndexError, "string index out of range");
3144 return -1;
3145 }
3146 if (_PyUnicode_Dirty(unicode))
3147 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003148 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3149 index, ch);
3150 return 0;
3151}
3152
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3158
Victor Stinner554f3f02010-06-16 23:33:54 +00003159/* create or adjust a UnicodeDecodeError */
3160static void
3161make_decode_exception(PyObject **exceptionObject,
3162 const char *encoding,
3163 const char *input, Py_ssize_t length,
3164 Py_ssize_t startpos, Py_ssize_t endpos,
3165 const char *reason)
3166{
3167 if (*exceptionObject == NULL) {
3168 *exceptionObject = PyUnicodeDecodeError_Create(
3169 encoding, input, length, startpos, endpos, reason);
3170 }
3171 else {
3172 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3173 goto onError;
3174 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3175 goto onError;
3176 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3177 goto onError;
3178 }
3179 return;
3180
3181onError:
3182 Py_DECREF(*exceptionObject);
3183 *exceptionObject = NULL;
3184}
3185
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003186/* error handling callback helper:
3187 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003188 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003189 and adjust various state variables.
3190 return 0 on success, -1 on error
3191*/
3192
Alexander Belopolsky40018472011-02-26 01:02:56 +00003193static int
3194unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003195 const char *encoding, const char *reason,
3196 const char **input, const char **inend, Py_ssize_t *startinpos,
3197 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3198 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003199{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003200 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003201
3202 PyObject *restuple = NULL;
3203 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003204 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003205 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003206 Py_ssize_t requiredsize;
3207 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003208 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003209 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003210 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003211 int res = -1;
3212
3213 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003214 *errorHandler = PyCodec_LookupError(errors);
3215 if (*errorHandler == NULL)
3216 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003217 }
3218
Victor Stinner554f3f02010-06-16 23:33:54 +00003219 make_decode_exception(exceptionObject,
3220 encoding,
3221 *input, *inend - *input,
3222 *startinpos, *endinpos,
3223 reason);
3224 if (*exceptionObject == NULL)
3225 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003226
3227 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3228 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003229 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003230 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003231 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003232 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003233 }
3234 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003235 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003236
3237 /* Copy back the bytes variables, which might have been modified by the
3238 callback */
3239 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3240 if (!inputobj)
3241 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003242 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003243 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003244 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003245 *input = PyBytes_AS_STRING(inputobj);
3246 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003247 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003248 /* we can DECREF safely, as the exception has another reference,
3249 so the object won't go away. */
3250 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003251
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003252 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003253 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003254 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003255 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3256 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003257 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003258
3259 /* need more space? (at least enough for what we
3260 have+the replacement+the rest of the string (starting
3261 at the new input position), so we won't have to check space
3262 when there are no errors in the rest of the string) */
3263 repptr = PyUnicode_AS_UNICODE(repunicode);
3264 repsize = PyUnicode_GET_SIZE(repunicode);
3265 requiredsize = *outpos + repsize + insize-newpos;
3266 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003267 if (requiredsize<2*outsize)
3268 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003269 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003270 goto onError;
3271 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003272 }
3273 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003274 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003275 Py_UNICODE_COPY(*outptr, repptr, repsize);
3276 *outptr += repsize;
3277 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003278
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003279 /* we made it! */
3280 res = 0;
3281
Benjamin Peterson29060642009-01-31 22:14:21 +00003282 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003283 Py_XDECREF(restuple);
3284 return res;
3285}
3286
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003287/* --- UTF-7 Codec -------------------------------------------------------- */
3288
Antoine Pitrou244651a2009-05-04 18:56:13 +00003289/* See RFC2152 for details. We encode conservatively and decode liberally. */
3290
/* Three small helper macros for the base-64 alphabet used by UTF-7.
   Each one may evaluate its argument more than once. */

/* True if c is one of the 64 base-64 characters (A-Z, a-z, 0-9, '+', '/') */

#define IS_BASE64(c) \
    (('A' <= (c) && (c) <= 'Z') ||     \
     ('a' <= (c) && (c) <= 'z') ||     \
     ('0' <= (c) && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* Map a base-64 character to its 6-bit value; c must satisfy IS_BASE64
   (anything that is not a letter, digit or '+' is treated as '/') */

#define FROM_BASE64(c)                                      \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :               \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the low 6 bits of n */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
3313
/* DECODE_DIRECT: true if a byte found in a UTF-7 string outside a shift
 * sequence decodes as itself.  Decoding is permissive: every ASCII byte
 * qualifies except '+', which starts a base64 section. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) < 128)
3321
/* Classification of the 128 ASCII characters for the UTF-7 encoder
 * (see RFC2152).  Each entry is one of:
 *   0 : "Set D"      - alphanumeric and '(),-./:?
 *   1 : "Set O"      - !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace" - ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 * The table is indexed by character code; each row covers 16 codes.
 */

static
char utf7_category[128] = {
    /* 0x00: nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10: dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20: sp  !   "   #   $   %   &   '   (   )   *   +   ,   -   .   / */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30: 0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40: @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50: P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60: `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70: p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};
3355
/* ENCODE_DIRECT: true if character c should be encoded as itself.
 * c must be in 1..127 and its utf7_category entry must be
 *   0 (Set D),
 *   2 (whitespace) when directWS is true, or
 *   1 (Set O)      when directO is true.
 * RFC2152 makes it clear that whether set O and whitespace are encoded
 * directly varies between applications, so both are left to the caller.
 * The flag arguments are parenthesized in the expansion so that compound
 * expressions can be passed safely.  c may be evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003367
Alexander Belopolsky40018472011-02-26 01:02:56 +00003368PyObject *
3369PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003370 Py_ssize_t size,
3371 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003372{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003373 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3374}
3375
Antoine Pitrou244651a2009-05-04 18:56:13 +00003376/* The decoder. The only state we preserve is our read position,
3377 * i.e. how many characters we have consumed. So if we end in the
3378 * middle of a shift sequence we have to back off the read position
3379 * and the output to the beginning of the sequence, otherwise we lose
3380 * all the shift state (seen bits, number of bits seen, high
3381 * surrogate). */
3382
Alexander Belopolsky40018472011-02-26 01:02:56 +00003383PyObject *
3384PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003385 Py_ssize_t size,
3386 const char *errors,
3387 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003388{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003389 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003390 Py_ssize_t startinpos;
3391 Py_ssize_t endinpos;
3392 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003393 const char *e;
3394 PyUnicodeObject *unicode;
3395 Py_UNICODE *p;
3396 const char *errmsg = "";
3397 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003398 Py_UNICODE *shiftOutStart;
3399 unsigned int base64bits = 0;
3400 unsigned long base64buffer = 0;
3401 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003402 PyObject *errorHandler = NULL;
3403 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003404
3405 unicode = _PyUnicode_New(size);
3406 if (!unicode)
3407 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003408 if (size == 0) {
3409 if (consumed)
3410 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003411 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003412 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003414 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003415 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003416 e = s + size;
3417
3418 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003419 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003420 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003421 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003422
Antoine Pitrou244651a2009-05-04 18:56:13 +00003423 if (inShift) { /* in a base-64 section */
3424 if (IS_BASE64(ch)) { /* consume a base-64 character */
3425 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3426 base64bits += 6;
3427 s++;
3428 if (base64bits >= 16) {
3429 /* we have enough bits for a UTF-16 value */
3430 Py_UNICODE outCh = (Py_UNICODE)
3431 (base64buffer >> (base64bits-16));
3432 base64bits -= 16;
3433 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3434 if (surrogate) {
3435 /* expecting a second surrogate */
3436 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3437#ifdef Py_UNICODE_WIDE
3438 *p++ = (((surrogate & 0x3FF)<<10)
3439 | (outCh & 0x3FF)) + 0x10000;
3440#else
3441 *p++ = surrogate;
3442 *p++ = outCh;
3443#endif
3444 surrogate = 0;
3445 }
3446 else {
3447 surrogate = 0;
3448 errmsg = "second surrogate missing";
3449 goto utf7Error;
3450 }
3451 }
3452 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3453 /* first surrogate */
3454 surrogate = outCh;
3455 }
3456 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3457 errmsg = "unexpected second surrogate";
3458 goto utf7Error;
3459 }
3460 else {
3461 *p++ = outCh;
3462 }
3463 }
3464 }
3465 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003466 inShift = 0;
3467 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003468 if (surrogate) {
3469 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003470 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003471 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003472 if (base64bits > 0) { /* left-over bits */
3473 if (base64bits >= 6) {
3474 /* We've seen at least one base-64 character */
3475 errmsg = "partial character in shift sequence";
3476 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003477 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003478 else {
3479 /* Some bits remain; they should be zero */
3480 if (base64buffer != 0) {
3481 errmsg = "non-zero padding bits in shift sequence";
3482 goto utf7Error;
3483 }
3484 }
3485 }
3486 if (ch != '-') {
3487 /* '-' is absorbed; other terminating
3488 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003489 *p++ = ch;
3490 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003491 }
3492 }
3493 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003494 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003495 s++; /* consume '+' */
3496 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003497 s++;
3498 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003499 }
3500 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003501 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003502 shiftOutStart = p;
3503 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003504 }
3505 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003506 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003507 *p++ = ch;
3508 s++;
3509 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003510 else {
3511 startinpos = s-starts;
3512 s++;
3513 errmsg = "unexpected special character";
3514 goto utf7Error;
3515 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003516 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003517utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518 outpos = p-PyUnicode_AS_UNICODE(unicode);
3519 endinpos = s-starts;
3520 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003521 errors, &errorHandler,
3522 "utf7", errmsg,
3523 &starts, &e, &startinpos, &endinpos, &exc, &s,
3524 &unicode, &outpos, &p))
3525 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003526 }
3527
Antoine Pitrou244651a2009-05-04 18:56:13 +00003528 /* end of string */
3529
3530 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3531 /* if we're in an inconsistent state, that's an error */
3532 if (surrogate ||
3533 (base64bits >= 6) ||
3534 (base64bits > 0 && base64buffer != 0)) {
3535 outpos = p-PyUnicode_AS_UNICODE(unicode);
3536 endinpos = size;
3537 if (unicode_decode_call_errorhandler(
3538 errors, &errorHandler,
3539 "utf7", "unterminated shift sequence",
3540 &starts, &e, &startinpos, &endinpos, &exc, &s,
3541 &unicode, &outpos, &p))
3542 goto onError;
3543 if (s < e)
3544 goto restart;
3545 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003546 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003547
3548 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003549 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003550 if (inShift) {
3551 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003552 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003553 }
3554 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003555 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003556 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003557 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003558
Victor Stinnerfe226c02011-10-03 03:52:20 +02003559 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003560 goto onError;
3561
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003562 Py_XDECREF(errorHandler);
3563 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003564 if (PyUnicode_READY(unicode) == -1) {
3565 Py_DECREF(unicode);
3566 return NULL;
3567 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003568 return (PyObject *)unicode;
3569
Benjamin Peterson29060642009-01-31 22:14:21 +00003570 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003571 Py_XDECREF(errorHandler);
3572 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003573 Py_DECREF(unicode);
3574 return NULL;
3575}
3576
3577
Alexander Belopolsky40018472011-02-26 01:02:56 +00003578PyObject *
3579PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003580 Py_ssize_t size,
3581 int base64SetO,
3582 int base64WhiteSpace,
3583 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003584{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003585 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003586 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003587 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003588 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003589 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003590 unsigned int base64bits = 0;
3591 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003592 char * out;
3593 char * start;
3594
3595 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003596 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003597
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003598 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003599 return PyErr_NoMemory();
3600
Antoine Pitrou244651a2009-05-04 18:56:13 +00003601 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003602 if (v == NULL)
3603 return NULL;
3604
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003605 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003606 for (;i < size; ++i) {
3607 Py_UNICODE ch = s[i];
3608
Antoine Pitrou244651a2009-05-04 18:56:13 +00003609 if (inShift) {
3610 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3611 /* shifting out */
3612 if (base64bits) { /* output remaining bits */
3613 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3614 base64buffer = 0;
3615 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003616 }
3617 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003618 /* Characters not in the BASE64 set implicitly unshift the sequence
3619 so no '-' is required, except if the character is itself a '-' */
3620 if (IS_BASE64(ch) || ch == '-') {
3621 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003622 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003623 *out++ = (char) ch;
3624 }
3625 else {
3626 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003627 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003628 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003629 else { /* not in a shift sequence */
3630 if (ch == '+') {
3631 *out++ = '+';
3632 *out++ = '-';
3633 }
3634 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3635 *out++ = (char) ch;
3636 }
3637 else {
3638 *out++ = '+';
3639 inShift = 1;
3640 goto encode_char;
3641 }
3642 }
3643 continue;
3644encode_char:
3645#ifdef Py_UNICODE_WIDE
3646 if (ch >= 0x10000) {
3647 /* code first surrogate */
3648 base64bits += 16;
3649 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3650 while (base64bits >= 6) {
3651 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3652 base64bits -= 6;
3653 }
3654 /* prepare second surrogate */
3655 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3656 }
3657#endif
3658 base64bits += 16;
3659 base64buffer = (base64buffer << 16) | ch;
3660 while (base64bits >= 6) {
3661 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3662 base64bits -= 6;
3663 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003664 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003665 if (base64bits)
3666 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3667 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003668 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003669 if (_PyBytes_Resize(&v, out - start) < 0)
3670 return NULL;
3671 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003672}
3673
Antoine Pitrou244651a2009-05-04 18:56:13 +00003674#undef IS_BASE64
3675#undef FROM_BASE64
3676#undef TO_BASE64
3677#undef DECODE_DIRECT
3678#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003679
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680/* --- UTF-8 Codec -------------------------------------------------------- */
3681
/* Map a UTF-8 lead byte to the length of the sequence it starts.
   Zero means the byte is not a legal prefix.  See RFC 3629 for details.
   Each row below covers 16 byte values. */
static
char utf8_code_length[256] = {
    /* 00-7F: single-byte (ASCII) sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-BF: never a valid first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1 illegal, C2-DF: two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4: four-byte sequences, F5-FF illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
3703
Alexander Belopolsky40018472011-02-26 01:02:56 +00003704PyObject *
3705PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003706 Py_ssize_t size,
3707 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708{
Walter Dörwald69652032004-09-07 20:24:22 +00003709 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3710}
3711
Antoine Pitrouab868312009-01-10 15:40:25 +00003712/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3713#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3714
3715/* Mask to quickly check whether a C 'long' contains a
3716 non-ASCII, UTF8-encoded char. */
3717#if (SIZEOF_LONG == 8)
3718# define ASCII_CHAR_MASK 0x8080808080808080L
3719#elif (SIZEOF_LONG == 4)
3720# define ASCII_CHAR_MASK 0x80808080L
3721#else
3722# error C 'long' size should be either 4 or 8!
3723#endif
3724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003725/* Scans a UTF-8 string and returns the maximum character to be expected,
3726 the size of the decoded unicode string and if any major errors were
3727 encountered.
3728
3729 This function does check basic UTF-8 sanity, it does however NOT CHECK
3730 if the string contains surrogates, and if all continuation bytes are
3731 within the correct ranges, these checks are performed in
3732 PyUnicode_DecodeUTF8Stateful.
3733
3734 If it sets has_errors to 1, it means the value of unicode_size and max_char
3735 will be bogus and you should not rely on useful information in them.
3736 */
3737static Py_UCS4
3738utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3739 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3740 int *has_errors)
3741{
3742 Py_ssize_t n;
3743 Py_ssize_t char_count = 0;
3744 Py_UCS4 max_char = 127, new_max;
3745 Py_UCS4 upper_bound;
3746 const unsigned char *p = (const unsigned char *)s;
3747 const unsigned char *end = p + string_size;
3748 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3749 int err = 0;
3750
3751 for (; p < end && !err; ++p, ++char_count) {
3752 /* Only check value if it's not a ASCII char... */
3753 if (*p < 0x80) {
3754 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3755 an explanation. */
3756 if (!((size_t) p & LONG_PTR_MASK)) {
3757 /* Help register allocation */
3758 register const unsigned char *_p = p;
3759 while (_p < aligned_end) {
3760 unsigned long value = *(unsigned long *) _p;
3761 if (value & ASCII_CHAR_MASK)
3762 break;
3763 _p += SIZEOF_LONG;
3764 char_count += SIZEOF_LONG;
3765 }
3766 p = _p;
3767 if (p == end)
3768 break;
3769 }
3770 }
3771 if (*p >= 0x80) {
3772 n = utf8_code_length[*p];
3773 new_max = max_char;
3774 switch (n) {
3775 /* invalid start byte */
3776 case 0:
3777 err = 1;
3778 break;
3779 case 2:
3780 /* Code points between 0x00FF and 0x07FF inclusive.
3781 Approximate the upper bound of the code point,
3782 if this flips over 255 we can be sure it will be more
3783 than 255 and the string will need 2 bytes per code coint,
3784 if it stays under or equal to 255, we can be sure 1 byte
3785 is enough.
3786 ((*p & 0b00011111) << 6) | 0b00111111 */
3787 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3788 if (max_char < upper_bound)
3789 new_max = upper_bound;
3790 /* Ensure we track at least that we left ASCII space. */
3791 if (new_max < 128)
3792 new_max = 128;
3793 break;
3794 case 3:
3795 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3796 always > 255 and <= 65535 and will always need 2 bytes. */
3797 if (max_char < 65535)
3798 new_max = 65535;
3799 break;
3800 case 4:
3801 /* Code point will be above 0xFFFF for sure in this case. */
3802 new_max = 65537;
3803 break;
3804 /* Internal error, this should be caught by the first if */
3805 case 1:
3806 default:
3807 assert(0 && "Impossible case in utf8_max_char_and_size");
3808 err = 1;
3809 }
3810 /* Instead of number of overall bytes for this code point,
3811 n containts the number of following bytes: */
3812 --n;
3813 /* Check if the follow up chars are all valid continuation bytes */
3814 if (n >= 1) {
3815 const unsigned char *cont;
3816 if ((p + n) >= end) {
3817 if (consumed == 0)
3818 /* incomplete data, non-incremental decoding */
3819 err = 1;
3820 break;
3821 }
3822 for (cont = p + 1; cont < (p + n); ++cont) {
3823 if ((*cont & 0xc0) != 0x80) {
3824 err = 1;
3825 break;
3826 }
3827 }
3828 p += n;
3829 }
3830 else
3831 err = 1;
3832 max_char = new_max;
3833 }
3834 }
3835
3836 if (unicode_size)
3837 *unicode_size = char_count;
3838 if (has_errors)
3839 *has_errors = err;
3840 return max_char;
3841}
3842
/* Store `value` at position `index` of the character buffer `buf`, with
   the element type selected by `kind`.  Similar to PyUnicode_WRITE, but
   PyUnicode_WCHAR_KIND is accepted as well, in which case `buf` is treated
   as the Py_UNICODE (wstr) buffer of the legacy unicode representation.
   Any kind other than WCHAR, 1BYTE or 2BYTE is written as Py_UCS4.
   `kind` and `index` are each evaluated exactly once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        switch ((int)(kind)) {                                              \
        case PyUnicode_WCHAR_KIND:                                          \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
            break;                                                          \
        case PyUnicode_1BYTE_KIND:                                          \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
            break;                                                          \
        case PyUnicode_2BYTE_KIND:                                          \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
            break;                                                          \
        default:                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
            break;                                                          \
        }                                                                   \
    } while (0)
3857
Alexander Belopolsky40018472011-02-26 01:02:56 +00003858PyObject *
3859PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003860 Py_ssize_t size,
3861 const char *errors,
3862 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003863{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003864 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003865 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00003866 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003867 Py_ssize_t startinpos;
3868 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00003869 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003870 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003871 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003872 PyObject *errorHandler = NULL;
3873 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003874 Py_UCS4 maxchar = 0;
3875 Py_ssize_t unicode_size;
3876 Py_ssize_t i;
3877 int kind;
3878 void *data;
3879 int has_errors;
3880 Py_UNICODE *error_outptr;
3881#if SIZEOF_WCHAR_T == 2
3882 Py_ssize_t wchar_offset = 0;
3883#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003884
Walter Dörwald69652032004-09-07 20:24:22 +00003885 if (size == 0) {
3886 if (consumed)
3887 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003888 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00003889 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003890 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3891 consumed, &has_errors);
3892 if (has_errors) {
3893 unicode = _PyUnicode_New(size);
3894 if (!unicode)
3895 return NULL;
3896 kind = PyUnicode_WCHAR_KIND;
3897 data = PyUnicode_AS_UNICODE(unicode);
3898 assert(data != NULL);
3899 }
3900 else {
3901 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3902 if (!unicode)
3903 return NULL;
3904 /* When the string is ASCII only, just use memcpy and return.
3905 unicode_size may be != size if there is an incomplete UTF-8
3906 sequence at the end of the ASCII block. */
3907 if (maxchar < 128 && size == unicode_size) {
3908 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3909 return (PyObject *)unicode;
3910 }
3911 kind = PyUnicode_KIND(unicode);
3912 data = PyUnicode_DATA(unicode);
3913 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003914 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003915 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003916 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00003917 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003918
3919 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003920 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921
3922 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00003923 /* Fast path for runs of ASCII characters. Given that common UTF-8
3924 input will consist of an overwhelming majority of ASCII
3925 characters, we try to optimize for this case by checking
3926 as many characters as a C 'long' can contain.
3927 First, check if we can do an aligned read, as most CPUs have
3928 a penalty for unaligned reads.
3929 */
3930 if (!((size_t) s & LONG_PTR_MASK)) {
3931 /* Help register allocation */
3932 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003934 while (_s < aligned_end) {
3935 /* Read a whole long at a time (either 4 or 8 bytes),
3936 and do a fast unrolled copy if it only contains ASCII
3937 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003938 unsigned long value = *(unsigned long *) _s;
3939 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00003940 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003941 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
3942 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
3943 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
3944 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003945#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003946 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
3947 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
3948 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
3949 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00003950#endif
3951 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003952 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00003953 }
3954 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003955 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00003956 if (s == e)
3957 break;
3958 ch = (unsigned char)*s;
3959 }
3960 }
3961
3962 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003963 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964 s++;
3965 continue;
3966 }
3967
3968 n = utf8_code_length[ch];
3969
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003970 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003971 if (consumed)
3972 break;
3973 else {
3974 errmsg = "unexpected end of data";
3975 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00003976 endinpos = startinpos+1;
3977 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
3978 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00003979 goto utf8Error;
3980 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003981 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982
3983 switch (n) {
3984
3985 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00003986 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00003987 startinpos = s-starts;
3988 endinpos = startinpos+1;
3989 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003990
3991 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003992 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00003993 startinpos = s-starts;
3994 endinpos = startinpos+1;
3995 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996
3997 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003998 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00003999 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004000 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004001 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004002 goto utf8Error;
4003 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004005 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007 break;
4008
4009 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004010 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4011 will result in surrogates in range d800-dfff. Surrogates are
4012 not valid UTF-8 so they are rejected.
4013 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4014 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004015 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004016 (s[2] & 0xc0) != 0x80 ||
4017 ((unsigned char)s[0] == 0xE0 &&
4018 (unsigned char)s[1] < 0xA0) ||
4019 ((unsigned char)s[0] == 0xED &&
4020 (unsigned char)s[1] > 0x9F)) {
4021 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004022 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004023 endinpos = startinpos + 1;
4024
4025 /* if s[1] first two bits are 1 and 0, then the invalid
4026 continuation byte is s[2], so increment endinpos by 1,
4027 if not, s[1] is invalid and endinpos doesn't need to
4028 be incremented. */
4029 if ((s[1] & 0xC0) == 0x80)
4030 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004031 goto utf8Error;
4032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004033 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004034 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004035 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004036 break;
4037
4038 case 4:
4039 if ((s[1] & 0xc0) != 0x80 ||
4040 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004041 (s[3] & 0xc0) != 0x80 ||
4042 ((unsigned char)s[0] == 0xF0 &&
4043 (unsigned char)s[1] < 0x90) ||
4044 ((unsigned char)s[0] == 0xF4 &&
4045 (unsigned char)s[1] > 0x8F)) {
4046 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004047 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004048 endinpos = startinpos + 1;
4049 if ((s[1] & 0xC0) == 0x80) {
4050 endinpos++;
4051 if ((s[2] & 0xC0) == 0x80)
4052 endinpos++;
4053 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004054 goto utf8Error;
4055 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004056 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004057 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4058 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004060 /* If the string is flexible or we have native UCS-4, write
4061 directly.. */
4062 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4063 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004065 else {
4066 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004068 /* translate from 10000..10FFFF to 0..FFFF */
4069 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004071 /* high surrogate = top 10 bits added to D800 */
4072 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4073 (Py_UNICODE)(0xD800 + (ch >> 10)));
4074
4075 /* low surrogate = bottom 10 bits added to DC00 */
4076 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4077 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4078 }
4079#if SIZEOF_WCHAR_T == 2
4080 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004081#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004083 }
4084 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004085 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004086
Benjamin Peterson29060642009-01-31 22:14:21 +00004087 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004088 /* If this is not yet a resizable string, make it one.. */
4089 if (kind != PyUnicode_WCHAR_KIND) {
4090 const Py_UNICODE *u;
4091 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4092 if (!new_unicode)
4093 goto onError;
4094 u = PyUnicode_AsUnicode((PyObject *)unicode);
4095 if (!u)
4096 goto onError;
4097#if SIZEOF_WCHAR_T == 2
4098 i += wchar_offset;
4099#endif
4100 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4101 Py_DECREF(unicode);
4102 unicode = new_unicode;
4103 kind = 0;
4104 data = PyUnicode_AS_UNICODE(new_unicode);
4105 assert(data != NULL);
4106 }
4107 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004108 if (unicode_decode_call_errorhandler(
4109 errors, &errorHandler,
4110 "utf8", errmsg,
4111 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004112 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004113 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004114 /* Update data because unicode_decode_call_errorhandler might have
4115 re-created or resized the unicode object. */
4116 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004117 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004119 /* Ensure the unicode_size calculation above was correct: */
4120 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4121
Walter Dörwald69652032004-09-07 20:24:22 +00004122 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004123 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004125 /* Adjust length and ready string when it contained errors and
4126 is of the old resizable kind. */
4127 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02004128 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0 ||
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004129 PyUnicode_READY(unicode) == -1)
4130 goto onError;
4131 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004133 Py_XDECREF(errorHandler);
4134 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004135 if (PyUnicode_READY(unicode) == -1) {
4136 Py_DECREF(unicode);
4137 return NULL;
4138 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139 return (PyObject *)unicode;
4140
Benjamin Peterson29060642009-01-31 22:14:21 +00004141 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004142 Py_XDECREF(errorHandler);
4143 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144 Py_DECREF(unicode);
4145 return NULL;
4146}
4147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004148#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004149
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes starting at `s` into a buffer obtained from
   PyMem_Malloc() and terminated with L'\0'.  A byte that does not start a
   valid sequence is stored as 0xDC00 + byte and decoding resumes at the
   next byte.  Returns NULL if the size check or the allocation fails
   (PyErr_NoMemory() is called only in the former case). */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *buffer, *out;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (buffer == NULL)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = buffer;
    end = s + size;
    while (s < end) {
        /* The lead byte is kept separately so that it is still available
           for escaping once a sequence turns out to be invalid. */
        const Py_UCS4 lead = (unsigned char)*s;
        Py_UCS4 ch;
        int seqlen;
        int valid;

        if (lead < 0x80) {
            *out++ = (wchar_t)lead;
            s++;
            continue;
        }

        seqlen = utf8_code_length[lead];
        /* A sequence running past the end of the input is invalid. */
        valid = !(s + seqlen > end);

        if (valid) {
            switch (seqlen) {
            case 0:
            case 1:
                valid = 0;
                break;

            case 2:
                if ((s[1] & 0xc0) != 0x80) {
                    valid = 0;
                    break;
                }
                ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
                assert ((ch > 0x007F) && (ch <= 0x07FF));
                *out++ = (wchar_t)ch;
                break;

            case 3:
                /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
                   will result in surrogates in range d800-dfff. Surrogates are
                   not valid UTF-8 so they are rejected.
                   See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
                if ((s[1] & 0xc0) != 0x80 || (s[2] & 0xc0) != 0x80)
                    valid = 0;
                else if ((unsigned char)s[0] == 0xE0 &&
                         (unsigned char)s[1] < 0xA0)
                    valid = 0;
                else if ((unsigned char)s[0] == 0xED &&
                         (unsigned char)s[1] > 0x9F)
                    valid = 0;
                if (!valid)
                    break;
                ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
                assert ((ch > 0x07FF) && (ch <= 0xFFFF));
                *out++ = (wchar_t)ch;
                break;

            case 4:
                if ((s[1] & 0xc0) != 0x80 ||
                    (s[2] & 0xc0) != 0x80 ||
                    (s[3] & 0xc0) != 0x80)
                    valid = 0;
                else if ((unsigned char)s[0] == 0xF0 &&
                         (unsigned char)s[1] < 0x90)
                    valid = 0;
                else if ((unsigned char)s[0] == 0xF4 &&
                         (unsigned char)s[1] > 0x8F)
                    valid = 0;
                if (!valid)
                    break;
                ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                     ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
                assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
                *out++ = (wchar_t)ch;
#else
                /* compute and append the two surrogates: */

                /* translate from 10000..10FFFF to 0..FFFF */
                ch -= 0x10000;

                /* high surrogate = top 10 bits added to D800 */
                *out++ = (wchar_t)(0xD800 + (ch >> 10));

                /* low surrogate = bottom 10 bits added to DC00 */
                *out++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
                break;
            }
        }

        if (valid) {
            s += seqlen;
        }
        else {
            /* surrogateescape: store the offending byte as 0xDC00 + byte
               and resume with the byte that follows it */
            *out++ = 0xDC00 + lead;
            s++;
        }
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004265/* Primary internal function which creates utf8 encoded bytes objects.
4266
4267 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004268 and allocate exactly as much space needed at the end. Else allocate the
4269 maximum possible needed (4 result bytes per Unicode character), and return
4270 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004271*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004272PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004273_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004274{
Tim Peters602f7402002-04-27 18:03:26 +00004275#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004276
Guido van Rossum98297ee2007-11-06 21:34:58 +00004277 Py_ssize_t i; /* index into s of next input byte */
4278 PyObject *result; /* result string object */
4279 char *p; /* next free byte in output buffer */
4280 Py_ssize_t nallocated; /* number of result bytes allocated */
4281 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004282 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004283 PyObject *errorHandler = NULL;
4284 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004285 int kind;
4286 void *data;
4287 Py_ssize_t size;
4288 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4289#if SIZEOF_WCHAR_T == 2
4290 Py_ssize_t wchar_offset = 0;
4291#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004293 if (!PyUnicode_Check(unicode)) {
4294 PyErr_BadArgument();
4295 return NULL;
4296 }
4297
4298 if (PyUnicode_READY(unicode) == -1)
4299 return NULL;
4300
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004301 if (PyUnicode_UTF8(unicode))
4302 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4303 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004304
4305 kind = PyUnicode_KIND(unicode);
4306 data = PyUnicode_DATA(unicode);
4307 size = PyUnicode_GET_LENGTH(unicode);
4308
Tim Peters602f7402002-04-27 18:03:26 +00004309 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004310
Tim Peters602f7402002-04-27 18:03:26 +00004311 if (size <= MAX_SHORT_UNICHARS) {
4312 /* Write into the stack buffer; nallocated can't overflow.
4313 * At the end, we'll allocate exactly as much heap space as it
4314 * turns out we need.
4315 */
4316 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004317 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004318 p = stackbuf;
4319 }
4320 else {
4321 /* Overallocate on the heap, and give the excess back at the end. */
4322 nallocated = size * 4;
4323 if (nallocated / 4 != size) /* overflow! */
4324 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004325 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004326 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004327 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004328 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004329 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004330
Tim Peters602f7402002-04-27 18:03:26 +00004331 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004332 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004333
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004334 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004335 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004337
Guido van Rossumd57fd912000-03-10 22:53:23 +00004338 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004339 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004340 *p++ = (char)(0xc0 | (ch >> 6));
4341 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004342 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004343 Py_ssize_t newpos;
4344 PyObject *rep;
4345 Py_ssize_t repsize, k, startpos;
4346 startpos = i-1;
4347#if SIZEOF_WCHAR_T == 2
4348 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004349#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004350 rep = unicode_encode_call_errorhandler(
4351 errors, &errorHandler, "utf-8", "surrogates not allowed",
4352 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4353 &exc, startpos, startpos+1, &newpos);
4354 if (!rep)
4355 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004357 if (PyBytes_Check(rep))
4358 repsize = PyBytes_GET_SIZE(rep);
4359 else
4360 repsize = PyUnicode_GET_SIZE(rep);
4361
4362 if (repsize > 4) {
4363 Py_ssize_t offset;
4364
4365 if (result == NULL)
4366 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004367 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004368 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004370 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4371 /* integer overflow */
4372 PyErr_NoMemory();
4373 goto error;
4374 }
4375 nallocated += repsize - 4;
4376 if (result != NULL) {
4377 if (_PyBytes_Resize(&result, nallocated) < 0)
4378 goto error;
4379 } else {
4380 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004381 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004382 goto error;
4383 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4384 }
4385 p = PyBytes_AS_STRING(result) + offset;
4386 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004387
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004388 if (PyBytes_Check(rep)) {
4389 char *prep = PyBytes_AS_STRING(rep);
4390 for(k = repsize; k > 0; k--)
4391 *p++ = *prep++;
4392 } else /* rep is unicode */ {
4393 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4394 Py_UNICODE c;
4395
4396 for(k=0; k<repsize; k++) {
4397 c = prep[k];
4398 if (0x80 <= c) {
4399 raise_encode_exception(&exc, "utf-8",
4400 PyUnicode_AS_UNICODE(unicode),
4401 size, i-1, i,
4402 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004403 goto error;
4404 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004405 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004406 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004408 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004409 } else if (ch < 0x10000) {
4410 *p++ = (char)(0xe0 | (ch >> 12));
4411 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4412 *p++ = (char)(0x80 | (ch & 0x3f));
4413 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004414 /* Encode UCS4 Unicode ordinals */
4415 *p++ = (char)(0xf0 | (ch >> 18));
4416 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4417 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4418 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004419#if SIZEOF_WCHAR_T == 2
4420 wchar_offset++;
4421#endif
Tim Peters602f7402002-04-27 18:03:26 +00004422 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004424
Guido van Rossum98297ee2007-11-06 21:34:58 +00004425 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004426 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004427 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004428 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004429 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004430 }
4431 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004432 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004433 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004434 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004435 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004436 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004437
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004438 Py_XDECREF(errorHandler);
4439 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004440 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004441 error:
4442 Py_XDECREF(errorHandler);
4443 Py_XDECREF(exc);
4444 Py_XDECREF(result);
4445 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004446
Tim Peters602f7402002-04-27 18:03:26 +00004447#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004448}
4449
Alexander Belopolsky40018472011-02-26 01:02:56 +00004450PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004451PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4452 Py_ssize_t size,
4453 const char *errors)
4454{
4455 PyObject *v, *unicode;
4456
4457 unicode = PyUnicode_FromUnicode(s, size);
4458 if (unicode == NULL)
4459 return NULL;
4460 v = _PyUnicode_AsUTF8String(unicode, errors);
4461 Py_DECREF(unicode);
4462 return v;
4463}
4464
4465PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004466PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004468 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469}
4470
Walter Dörwald41980ca2007-08-16 21:55:45 +00004471/* --- UTF-32 Codec ------------------------------------------------------- */
4472
4473PyObject *
4474PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004475 Py_ssize_t size,
4476 const char *errors,
4477 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004478{
4479 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4480}
4481
4482PyObject *
4483PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004484 Py_ssize_t size,
4485 const char *errors,
4486 int *byteorder,
4487 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004488{
4489 const char *starts = s;
4490 Py_ssize_t startinpos;
4491 Py_ssize_t endinpos;
4492 Py_ssize_t outpos;
4493 PyUnicodeObject *unicode;
4494 Py_UNICODE *p;
4495#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004496 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004497 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004498#else
4499 const int pairs = 0;
4500#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004501 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004502 int bo = 0; /* assume native ordering by default */
4503 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004504 /* Offsets from q for retrieving bytes in the right order. */
4505#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4506 int iorder[] = {0, 1, 2, 3};
4507#else
4508 int iorder[] = {3, 2, 1, 0};
4509#endif
4510 PyObject *errorHandler = NULL;
4511 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004512
Walter Dörwald41980ca2007-08-16 21:55:45 +00004513 q = (unsigned char *)s;
4514 e = q + size;
4515
4516 if (byteorder)
4517 bo = *byteorder;
4518
4519 /* Check for BOM marks (U+FEFF) in the input and adjust current
4520 byte order setting accordingly. In native mode, the leading BOM
4521 mark is skipped, in all other modes, it is copied to the output
4522 stream as-is (giving a ZWNBSP character). */
4523 if (bo == 0) {
4524 if (size >= 4) {
4525 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004526 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004527#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004528 if (bom == 0x0000FEFF) {
4529 q += 4;
4530 bo = -1;
4531 }
4532 else if (bom == 0xFFFE0000) {
4533 q += 4;
4534 bo = 1;
4535 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004536#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004537 if (bom == 0x0000FEFF) {
4538 q += 4;
4539 bo = 1;
4540 }
4541 else if (bom == 0xFFFE0000) {
4542 q += 4;
4543 bo = -1;
4544 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004545#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004546 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004547 }
4548
4549 if (bo == -1) {
4550 /* force LE */
4551 iorder[0] = 0;
4552 iorder[1] = 1;
4553 iorder[2] = 2;
4554 iorder[3] = 3;
4555 }
4556 else if (bo == 1) {
4557 /* force BE */
4558 iorder[0] = 3;
4559 iorder[1] = 2;
4560 iorder[2] = 1;
4561 iorder[3] = 0;
4562 }
4563
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004564 /* On narrow builds we split characters outside the BMP into two
4565 codepoints => count how much extra space we need. */
4566#ifndef Py_UNICODE_WIDE
4567 for (qq = q; qq < e; qq += 4)
4568 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4569 pairs++;
4570#endif
4571
4572 /* This might be one to much, because of a BOM */
4573 unicode = _PyUnicode_New((size+3)/4+pairs);
4574 if (!unicode)
4575 return NULL;
4576 if (size == 0)
4577 return (PyObject *)unicode;
4578
4579 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004580 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004581
Walter Dörwald41980ca2007-08-16 21:55:45 +00004582 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004583 Py_UCS4 ch;
4584 /* remaining bytes at the end? (size should be divisible by 4) */
4585 if (e-q<4) {
4586 if (consumed)
4587 break;
4588 errmsg = "truncated data";
4589 startinpos = ((const char *)q)-starts;
4590 endinpos = ((const char *)e)-starts;
4591 goto utf32Error;
4592 /* The remaining input chars are ignored if the callback
4593 chooses to skip the input */
4594 }
4595 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4596 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004597
Benjamin Peterson29060642009-01-31 22:14:21 +00004598 if (ch >= 0x110000)
4599 {
4600 errmsg = "codepoint not in range(0x110000)";
4601 startinpos = ((const char *)q)-starts;
4602 endinpos = startinpos+4;
4603 goto utf32Error;
4604 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004605#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004606 if (ch >= 0x10000)
4607 {
4608 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4609 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4610 }
4611 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004612#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004613 *p++ = ch;
4614 q += 4;
4615 continue;
4616 utf32Error:
4617 outpos = p-PyUnicode_AS_UNICODE(unicode);
4618 if (unicode_decode_call_errorhandler(
4619 errors, &errorHandler,
4620 "utf32", errmsg,
4621 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4622 &unicode, &outpos, &p))
4623 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004624 }
4625
4626 if (byteorder)
4627 *byteorder = bo;
4628
4629 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004630 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004631
4632 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004633 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004634 goto onError;
4635
4636 Py_XDECREF(errorHandler);
4637 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004638 if (PyUnicode_READY(unicode) == -1) {
4639 Py_DECREF(unicode);
4640 return NULL;
4641 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004642 return (PyObject *)unicode;
4643
Benjamin Peterson29060642009-01-31 22:14:21 +00004644 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004645 Py_DECREF(unicode);
4646 Py_XDECREF(errorHandler);
4647 Py_XDECREF(exc);
4648 return NULL;
4649}
4650
4651PyObject *
4652PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004653 Py_ssize_t size,
4654 const char *errors,
4655 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004656{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004657 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004658 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004659 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004660#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004661 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004662#else
4663 const int pairs = 0;
4664#endif
4665 /* Offsets from p for storing byte pairs in the right order. */
4666#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4667 int iorder[] = {0, 1, 2, 3};
4668#else
4669 int iorder[] = {3, 2, 1, 0};
4670#endif
4671
Benjamin Peterson29060642009-01-31 22:14:21 +00004672#define STORECHAR(CH) \
4673 do { \
4674 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4675 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4676 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4677 p[iorder[0]] = (CH) & 0xff; \
4678 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004679 } while(0)
4680
4681 /* In narrow builds we can output surrogate pairs as one codepoint,
4682 so we need less space. */
4683#ifndef Py_UNICODE_WIDE
4684 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004685 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4686 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4687 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004688#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004689 nsize = (size - pairs + (byteorder == 0));
4690 bytesize = nsize * 4;
4691 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004692 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004693 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004694 if (v == NULL)
4695 return NULL;
4696
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004697 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004698 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004699 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004700 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004701 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004702
4703 if (byteorder == -1) {
4704 /* force LE */
4705 iorder[0] = 0;
4706 iorder[1] = 1;
4707 iorder[2] = 2;
4708 iorder[3] = 3;
4709 }
4710 else if (byteorder == 1) {
4711 /* force BE */
4712 iorder[0] = 3;
4713 iorder[1] = 2;
4714 iorder[2] = 1;
4715 iorder[3] = 0;
4716 }
4717
4718 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004719 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004720#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004721 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4722 Py_UCS4 ch2 = *s;
4723 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4724 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4725 s++;
4726 size--;
4727 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004728 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004729#endif
4730 STORECHAR(ch);
4731 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004732
4733 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004734 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004735#undef STORECHAR
4736}
4737
Alexander Belopolsky40018472011-02-26 01:02:56 +00004738PyObject *
4739PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004740{
4741 if (!PyUnicode_Check(unicode)) {
4742 PyErr_BadArgument();
4743 return NULL;
4744 }
4745 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004746 PyUnicode_GET_SIZE(unicode),
4747 NULL,
4748 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004749}
4750
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751/* --- UTF-16 Codec ------------------------------------------------------- */
4752
Tim Peters772747b2001-08-09 22:21:55 +00004753PyObject *
4754PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004755 Py_ssize_t size,
4756 const char *errors,
4757 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758{
Walter Dörwald69652032004-09-07 20:24:22 +00004759 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4760}
4761
Antoine Pitrouab868312009-01-10 15:40:25 +00004762/* Two masks for fast checking of whether a C 'long' may contain
4763 UTF16-encoded surrogate characters. This is an efficient heuristic,
4764 assuming that non-surrogate characters with a code point >= 0x8000 are
4765 rare in most input.
4766 FAST_CHAR_MASK is used when the input is in native byte ordering,
4767 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004768*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004769#if (SIZEOF_LONG == 8)
4770# define FAST_CHAR_MASK 0x8000800080008000L
4771# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4772#elif (SIZEOF_LONG == 4)
4773# define FAST_CHAR_MASK 0x80008000L
4774# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4775#else
4776# error C 'long' size should be either 4 or 8!
4777#endif
4778
Walter Dörwald69652032004-09-07 20:24:22 +00004779PyObject *
4780PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004781 Py_ssize_t size,
4782 const char *errors,
4783 int *byteorder,
4784 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004785{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004786 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004787 Py_ssize_t startinpos;
4788 Py_ssize_t endinpos;
4789 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790 PyUnicodeObject *unicode;
4791 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004792 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00004793 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00004794 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004795 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00004796 /* Offsets from q for retrieving byte pairs in the right order. */
4797#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4798 int ihi = 1, ilo = 0;
4799#else
4800 int ihi = 0, ilo = 1;
4801#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004802 PyObject *errorHandler = NULL;
4803 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004804
4805 /* Note: size will always be longer than the resulting Unicode
4806 character count */
4807 unicode = _PyUnicode_New(size);
4808 if (!unicode)
4809 return NULL;
4810 if (size == 0)
4811 return (PyObject *)unicode;
4812
4813 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004814 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00004815 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00004816 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817
4818 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00004819 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004821 /* Check for BOM marks (U+FEFF) in the input and adjust current
4822 byte order setting accordingly. In native mode, the leading BOM
4823 mark is skipped, in all other modes, it is copied to the output
4824 stream as-is (giving a ZWNBSP character). */
4825 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00004826 if (size >= 2) {
4827 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004828#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004829 if (bom == 0xFEFF) {
4830 q += 2;
4831 bo = -1;
4832 }
4833 else if (bom == 0xFFFE) {
4834 q += 2;
4835 bo = 1;
4836 }
Tim Petersced69f82003-09-16 20:30:58 +00004837#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004838 if (bom == 0xFEFF) {
4839 q += 2;
4840 bo = 1;
4841 }
4842 else if (bom == 0xFFFE) {
4843 q += 2;
4844 bo = -1;
4845 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004846#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004847 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00004848 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849
Tim Peters772747b2001-08-09 22:21:55 +00004850 if (bo == -1) {
4851 /* force LE */
4852 ihi = 1;
4853 ilo = 0;
4854 }
4855 else if (bo == 1) {
4856 /* force BE */
4857 ihi = 0;
4858 ilo = 1;
4859 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004860#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4861 native_ordering = ilo < ihi;
4862#else
4863 native_ordering = ilo > ihi;
4864#endif
Tim Peters772747b2001-08-09 22:21:55 +00004865
Antoine Pitrouab868312009-01-10 15:40:25 +00004866 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00004867 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004868 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00004869 /* First check for possible aligned read of a C 'long'. Unaligned
4870 reads are more expensive, better to defer to another iteration. */
4871 if (!((size_t) q & LONG_PTR_MASK)) {
4872 /* Fast path for runs of non-surrogate chars. */
4873 register const unsigned char *_q = q;
4874 Py_UNICODE *_p = p;
4875 if (native_ordering) {
4876 /* Native ordering is simple: as long as the input cannot
4877 possibly contain a surrogate char, do an unrolled copy
4878 of several 16-bit code points to the target object.
4879 The non-surrogate check is done on several input bytes
4880 at a time (as many as a C 'long' can contain). */
4881 while (_q < aligned_end) {
4882 unsigned long data = * (unsigned long *) _q;
4883 if (data & FAST_CHAR_MASK)
4884 break;
4885 _p[0] = ((unsigned short *) _q)[0];
4886 _p[1] = ((unsigned short *) _q)[1];
4887#if (SIZEOF_LONG == 8)
4888 _p[2] = ((unsigned short *) _q)[2];
4889 _p[3] = ((unsigned short *) _q)[3];
4890#endif
4891 _q += SIZEOF_LONG;
4892 _p += SIZEOF_LONG / 2;
4893 }
4894 }
4895 else {
4896 /* Byteswapped ordering is similar, but we must decompose
4897 the copy bytewise, and take care of zero'ing out the
4898 upper bytes if the target object is in 32-bit units
4899 (that is, in UCS-4 builds). */
4900 while (_q < aligned_end) {
4901 unsigned long data = * (unsigned long *) _q;
4902 if (data & SWAPPED_FAST_CHAR_MASK)
4903 break;
4904 /* Zero upper bytes in UCS-4 builds */
4905#if (Py_UNICODE_SIZE > 2)
4906 _p[0] = 0;
4907 _p[1] = 0;
4908#if (SIZEOF_LONG == 8)
4909 _p[2] = 0;
4910 _p[3] = 0;
4911#endif
4912#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004913 /* Issue #4916; UCS-4 builds on big endian machines must
4914 fill the two last bytes of each 4-byte unit. */
4915#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
4916# define OFF 2
4917#else
4918# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00004919#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00004920 ((unsigned char *) _p)[OFF + 1] = _q[0];
4921 ((unsigned char *) _p)[OFF + 0] = _q[1];
4922 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
4923 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
4924#if (SIZEOF_LONG == 8)
4925 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
4926 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
4927 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
4928 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
4929#endif
4930#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00004931 _q += SIZEOF_LONG;
4932 _p += SIZEOF_LONG / 2;
4933 }
4934 }
4935 p = _p;
4936 q = _q;
4937 if (q >= e)
4938 break;
4939 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004940 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004941
Benjamin Peterson14339b62009-01-31 16:36:08 +00004942 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00004943
4944 if (ch < 0xD800 || ch > 0xDFFF) {
4945 *p++ = ch;
4946 continue;
4947 }
4948
4949 /* UTF-16 code pair: */
4950 if (q > e) {
4951 errmsg = "unexpected end of data";
4952 startinpos = (((const char *)q) - 2) - starts;
4953 endinpos = ((const char *)e) + 1 - starts;
4954 goto utf16Error;
4955 }
4956 if (0xD800 <= ch && ch <= 0xDBFF) {
4957 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
4958 q += 2;
4959 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00004960#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004961 *p++ = ch;
4962 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004963#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004964 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004965#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004966 continue;
4967 }
4968 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004969 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00004970 startinpos = (((const char *)q)-4)-starts;
4971 endinpos = startinpos+2;
4972 goto utf16Error;
4973 }
4974
Benjamin Peterson14339b62009-01-31 16:36:08 +00004975 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004976 errmsg = "illegal encoding";
4977 startinpos = (((const char *)q)-2)-starts;
4978 endinpos = startinpos+2;
4979 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004980
Benjamin Peterson29060642009-01-31 22:14:21 +00004981 utf16Error:
4982 outpos = p - PyUnicode_AS_UNICODE(unicode);
4983 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00004984 errors,
4985 &errorHandler,
4986 "utf16", errmsg,
4987 &starts,
4988 (const char **)&e,
4989 &startinpos,
4990 &endinpos,
4991 &exc,
4992 (const char **)&q,
4993 &unicode,
4994 &outpos,
4995 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00004996 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004997 }
Antoine Pitrouab868312009-01-10 15:40:25 +00004998 /* remaining byte at the end? (size should be even) */
4999 if (e == q) {
5000 if (!consumed) {
5001 errmsg = "truncated data";
5002 startinpos = ((const char *)q) - starts;
5003 endinpos = ((const char *)e) + 1 - starts;
5004 outpos = p - PyUnicode_AS_UNICODE(unicode);
5005 if (unicode_decode_call_errorhandler(
5006 errors,
5007 &errorHandler,
5008 "utf16", errmsg,
5009 &starts,
5010 (const char **)&e,
5011 &startinpos,
5012 &endinpos,
5013 &exc,
5014 (const char **)&q,
5015 &unicode,
5016 &outpos,
5017 &p))
5018 goto onError;
5019 /* The remaining input chars are ignored if the callback
5020 chooses to skip the input */
5021 }
5022 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005023
5024 if (byteorder)
5025 *byteorder = bo;
5026
Walter Dörwald69652032004-09-07 20:24:22 +00005027 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005028 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005029
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005031 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005032 goto onError;
5033
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005034 Py_XDECREF(errorHandler);
5035 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005036 if (PyUnicode_READY(unicode) == -1) {
5037 Py_DECREF(unicode);
5038 return NULL;
5039 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005040 return (PyObject *)unicode;
5041
Benjamin Peterson29060642009-01-31 22:14:21 +00005042 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005043 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005044 Py_XDECREF(errorHandler);
5045 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005046 return NULL;
5047}
5048
Antoine Pitrouab868312009-01-10 15:40:25 +00005049#undef FAST_CHAR_MASK
5050#undef SWAPPED_FAST_CHAR_MASK
5051
Tim Peters772747b2001-08-09 22:21:55 +00005052PyObject *
5053PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 Py_ssize_t size,
5055 const char *errors,
5056 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005058 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005059 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005060 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005061#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005062 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005063#else
5064 const int pairs = 0;
5065#endif
Tim Peters772747b2001-08-09 22:21:55 +00005066 /* Offsets from p for storing byte pairs in the right order. */
5067#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5068 int ihi = 1, ilo = 0;
5069#else
5070 int ihi = 0, ilo = 1;
5071#endif
5072
Benjamin Peterson29060642009-01-31 22:14:21 +00005073#define STORECHAR(CH) \
5074 do { \
5075 p[ihi] = ((CH) >> 8) & 0xff; \
5076 p[ilo] = (CH) & 0xff; \
5077 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005078 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005080#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005081 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005082 if (s[i] >= 0x10000)
5083 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005084#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005085 /* 2 * (size + pairs + (byteorder == 0)) */
5086 if (size > PY_SSIZE_T_MAX ||
5087 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005088 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005089 nsize = size + pairs + (byteorder == 0);
5090 bytesize = nsize * 2;
5091 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005092 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005093 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005094 if (v == NULL)
5095 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005097 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005099 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005100 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005101 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005102
5103 if (byteorder == -1) {
5104 /* force LE */
5105 ihi = 1;
5106 ilo = 0;
5107 }
5108 else if (byteorder == 1) {
5109 /* force BE */
5110 ihi = 0;
5111 ilo = 1;
5112 }
5113
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005114 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005115 Py_UNICODE ch = *s++;
5116 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005117#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005118 if (ch >= 0x10000) {
5119 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5120 ch = 0xD800 | ((ch-0x10000) >> 10);
5121 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005122#endif
Tim Peters772747b2001-08-09 22:21:55 +00005123 STORECHAR(ch);
5124 if (ch2)
5125 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005126 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005127
5128 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005129 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005130#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131}
5132
Alexander Belopolsky40018472011-02-26 01:02:56 +00005133PyObject *
5134PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135{
5136 if (!PyUnicode_Check(unicode)) {
5137 PyErr_BadArgument();
5138 return NULL;
5139 }
5140 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005141 PyUnicode_GET_SIZE(unicode),
5142 NULL,
5143 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005144}
5145
5146/* --- Unicode Escape Codec ----------------------------------------------- */
5147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005148/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5149 if all the escapes in the string make it still a valid ASCII string.
5150 Returns -1 if any escapes were found which cause the string to
5151 pop out of ASCII range. Otherwise returns the length of the
5152 required buffer to hold the string.
5153 */
5154Py_ssize_t
5155length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5156{
5157 const unsigned char *p = (const unsigned char *)s;
5158 const unsigned char *end = p + size;
5159 Py_ssize_t length = 0;
5160
5161 if (size < 0)
5162 return -1;
5163
5164 for (; p < end; ++p) {
5165 if (*p > 127) {
5166 /* Non-ASCII */
5167 return -1;
5168 }
5169 else if (*p != '\\') {
5170 /* Normal character */
5171 ++length;
5172 }
5173 else {
5174 /* Backslash-escape, check next char */
5175 ++p;
5176 /* Escape sequence reaches till end of string or
5177 non-ASCII follow-up. */
5178 if (p >= end || *p > 127)
5179 return -1;
5180 switch (*p) {
5181 case '\n':
5182 /* backslash + \n result in zero characters */
5183 break;
5184 case '\\': case '\'': case '\"':
5185 case 'b': case 'f': case 't':
5186 case 'n': case 'r': case 'v': case 'a':
5187 ++length;
5188 break;
5189 case '0': case '1': case '2': case '3':
5190 case '4': case '5': case '6': case '7':
5191 case 'x': case 'u': case 'U': case 'N':
5192 /* these do not guarantee ASCII characters */
5193 return -1;
5194 default:
5195 /* count the backslash + the other character */
5196 length += 2;
5197 }
5198 }
5199 }
5200 return length;
5201}
5202
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   Stores `value` at position `index` of `buf`: as a Py_UNICODE when
   `kind` is PyUnicode_WCHAR_KIND, as a single unsigned char otherwise. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store `value` as a Py_UNICODE at position `index` of `buf`.  The assert
   reads a variable named `kind` from the scope where the macro is used.
   The whole expansion is parenthesized so that it stays a single
   expression whatever operators surround the macro call. */
#define WRITE_WSTR(buf, index, value) \
    (assert(kind == PyUnicode_WCHAR_KIND), \
     (((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)))
5216
5217
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in lazily the first time a \N{...}
   escape needs a character-name lookup -- confirm against the code that
   assigns it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005218static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005219
Alexander Belopolsky40018472011-02-26 01:02:56 +00005220PyObject *
5221PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005222 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005223 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005225 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005226 Py_ssize_t startinpos;
5227 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005228 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005230 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005232 char* message;
5233 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005234 PyObject *errorHandler = NULL;
5235 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005236 Py_ssize_t ascii_length;
5237 Py_ssize_t i;
5238 int kind;
5239 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005241 ascii_length = length_of_escaped_ascii_string(s, size);
5242
5243 /* After length_of_escaped_ascii_string() there are two alternatives,
5244 either the string is pure ASCII with named escapes like \n, etc.
5245 and we determined it's exact size (common case)
5246 or it contains \x, \u, ... escape sequences. then we create a
5247 legacy wchar string and resize it at the end of this function. */
5248 if (ascii_length >= 0) {
5249 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5250 if (!v)
5251 goto onError;
5252 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5253 kind = PyUnicode_1BYTE_KIND;
5254 data = PyUnicode_DATA(v);
5255 }
5256 else {
5257 /* Escaped strings will always be longer than the resulting
5258 Unicode string, so we start with size here and then reduce the
5259 length after conversion to the true value.
5260 (but if the error callback returns a long replacement string
5261 we'll have to allocate more space) */
5262 v = _PyUnicode_New(size);
5263 if (!v)
5264 goto onError;
5265 kind = PyUnicode_WCHAR_KIND;
5266 data = PyUnicode_AS_UNICODE(v);
5267 }
5268
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269 if (size == 0)
5270 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005271 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005273
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274 while (s < end) {
5275 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005276 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005277 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005278
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005279 if (kind == PyUnicode_WCHAR_KIND) {
5280 assert(i < _PyUnicode_WSTR_LENGTH(v));
5281 }
5282 else {
5283 /* The only case in which i == ascii_length is a backslash
5284 followed by a newline. */
5285 assert(i <= ascii_length);
5286 }
5287
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288 /* Non-escape characters are interpreted as Unicode ordinals */
5289 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005290 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291 continue;
5292 }
5293
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005294 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295 /* \ - Escapes */
5296 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005297 c = *s++;
5298 if (s > end)
5299 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005300
5301 if (kind == PyUnicode_WCHAR_KIND) {
5302 assert(i < _PyUnicode_WSTR_LENGTH(v));
5303 }
5304 else {
5305 /* The only case in which i == ascii_length is a backslash
5306 followed by a newline. */
5307 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5308 }
5309
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005310 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311
Benjamin Peterson29060642009-01-31 22:14:21 +00005312 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005314 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5315 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5316 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5317 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5318 /* FF */
5319 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5320 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5321 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5322 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5323 /* VT */
5324 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5325 /* BEL, not classic C */
5326 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327
Benjamin Peterson29060642009-01-31 22:14:21 +00005328 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329 case '0': case '1': case '2': case '3':
5330 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005331 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005332 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005333 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005334 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005335 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005337 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338 break;
5339
Benjamin Peterson29060642009-01-31 22:14:21 +00005340 /* hex escapes */
5341 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005343 digits = 2;
5344 message = "truncated \\xXX escape";
5345 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346
Benjamin Peterson29060642009-01-31 22:14:21 +00005347 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005349 digits = 4;
5350 message = "truncated \\uXXXX escape";
5351 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005354 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005355 digits = 8;
5356 message = "truncated \\UXXXXXXXX escape";
5357 hexescape:
5358 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005359 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005360 if (s+digits>end) {
5361 endinpos = size;
5362 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005363 errors, &errorHandler,
5364 "unicodeescape", "end of string in escape sequence",
5365 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005366 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005367 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005368 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005369 goto nextByte;
5370 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005371 for (j = 0; j < digits; ++j) {
5372 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005373 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005374 endinpos = (s+j+1)-starts;
5375 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005376 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005377 errors, &errorHandler,
5378 "unicodeescape", message,
5379 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005380 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005381 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005382 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005383 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005384 }
5385 chr = (chr<<4) & ~0xF;
5386 if (c >= '0' && c <= '9')
5387 chr += c - '0';
5388 else if (c >= 'a' && c <= 'f')
5389 chr += 10 + c - 'a';
5390 else
5391 chr += 10 + c - 'A';
5392 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005393 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005394 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005395 /* _decoding_error will have already written into the
5396 target buffer. */
5397 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005398 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005399 /* when we get here, chr is a 32-bit unicode character */
5400 if (chr <= 0xffff)
5401 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005402 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005403 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005404 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005405 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005406#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005407 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005408#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005409 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005410 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5411 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005412#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005413 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005414 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005415 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005416 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005417 errors, &errorHandler,
5418 "unicodeescape", "illegal Unicode character",
5419 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005420 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005421 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005422 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005423 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005424 break;
5425
Benjamin Peterson29060642009-01-31 22:14:21 +00005426 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005427 case 'N':
5428 message = "malformed \\N character escape";
5429 if (ucnhash_CAPI == NULL) {
5430 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005431 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5432 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005433 if (ucnhash_CAPI == NULL)
5434 goto ucnhashError;
5435 }
5436 if (*s == '{') {
5437 const char *start = s+1;
5438 /* look for the closing brace */
5439 while (*s != '}' && s < end)
5440 s++;
5441 if (s > start && s < end && *s == '}') {
5442 /* found a name. look it up in the unicode database */
5443 message = "unknown Unicode character name";
5444 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005445 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5446 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005447 goto store;
5448 }
5449 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005450 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005451 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005452 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005453 errors, &errorHandler,
5454 "unicodeescape", message,
5455 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005456 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005457 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005458 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005459 break;
5460
5461 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005462 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005463 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005464 message = "\\ at end of string";
5465 s--;
5466 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005467 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005468 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005469 errors, &errorHandler,
5470 "unicodeescape", message,
5471 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005472 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005473 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005474 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005475 }
5476 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005477 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5478 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005479 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005480 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005482 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005483 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005485 /* Ensure the length prediction worked in case of ASCII strings */
5486 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5487
Victor Stinnerfe226c02011-10-03 03:52:20 +02005488 if (kind == PyUnicode_WCHAR_KIND)
5489 {
5490 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5491 goto onError;
5492 if (PyUnicode_READY(v) == -1)
5493 goto onError;
5494 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005495 Py_XDECREF(errorHandler);
5496 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005498
Benjamin Peterson29060642009-01-31 22:14:21 +00005499 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005500 PyErr_SetString(
5501 PyExc_UnicodeError,
5502 "\\N escapes not supported (can't load unicodedata module)"
5503 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005504 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005505 Py_XDECREF(errorHandler);
5506 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005507 return NULL;
5508
Benjamin Peterson29060642009-01-31 22:14:21 +00005509 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005511 Py_XDECREF(errorHandler);
5512 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513 return NULL;
5514}
5515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005516#undef WRITE_ASCII_OR_WSTR
5517#undef WRITE_WSTR
5518
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519/* Return a Unicode-Escape string version of the Unicode object.
5520
5521 If quotes is true, the string is enclosed in u"" or u'' quotes as
5522 appropriate.
5523
5524*/
5525
Walter Dörwald79e913e2007-05-12 11:08:06 +00005526static const char *hexdigits = "0123456789abcdef";
5527
Alexander Belopolsky40018472011-02-26 01:02:56 +00005528PyObject *
5529PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005530 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005532 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005535#ifdef Py_UNICODE_WIDE
5536 const Py_ssize_t expandsize = 10;
5537#else
5538 const Py_ssize_t expandsize = 6;
5539#endif
5540
Thomas Wouters89f507f2006-12-13 04:49:30 +00005541 /* XXX(nnorwitz): rather than over-allocating, it would be
5542 better to choose a different scheme. Perhaps scan the
5543 first N-chars of the string and allocate based on that size.
5544 */
5545 /* Initial allocation is based on the longest-possible unichr
5546 escape.
5547
5548 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5549 unichr, so in this case it's the longest unichr escape. In
5550 narrow (UTF-16) builds this is five chars per source unichr
5551 since there are two unichrs in the surrogate pair, so in narrow
5552 (UTF-16) builds it's not the longest unichr escape.
5553
5554 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5555 so in the narrow (UTF-16) build case it's the longest unichr
5556 escape.
5557 */
5558
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005559 if (size == 0)
5560 return PyBytes_FromStringAndSize(NULL, 0);
5561
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005562 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005563 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005564
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005565 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005566 2
5567 + expandsize*size
5568 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 if (repr == NULL)
5570 return NULL;
5571
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005572 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574 while (size-- > 0) {
5575 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005576
Walter Dörwald79e913e2007-05-12 11:08:06 +00005577 /* Escape backslashes */
5578 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579 *p++ = '\\';
5580 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005581 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005582 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005583
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005584#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005585 /* Map 21-bit characters to '\U00xxxxxx' */
5586 else if (ch >= 0x10000) {
5587 *p++ = '\\';
5588 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005589 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5590 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5591 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5592 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5593 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5594 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5595 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5596 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005597 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005598 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005599#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005600 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5601 else if (ch >= 0xD800 && ch < 0xDC00) {
5602 Py_UNICODE ch2;
5603 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005604
Benjamin Peterson29060642009-01-31 22:14:21 +00005605 ch2 = *s++;
5606 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005607 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005608 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5609 *p++ = '\\';
5610 *p++ = 'U';
5611 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5612 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5613 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5614 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5615 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5616 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5617 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5618 *p++ = hexdigits[ucs & 0x0000000F];
5619 continue;
5620 }
5621 /* Fall through: isolated surrogates are copied as-is */
5622 s--;
5623 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005624 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005625#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005626
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005628 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 *p++ = '\\';
5630 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005631 *p++ = hexdigits[(ch >> 12) & 0x000F];
5632 *p++ = hexdigits[(ch >> 8) & 0x000F];
5633 *p++ = hexdigits[(ch >> 4) & 0x000F];
5634 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005636
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005637 /* Map special whitespace to '\t', \n', '\r' */
5638 else if (ch == '\t') {
5639 *p++ = '\\';
5640 *p++ = 't';
5641 }
5642 else if (ch == '\n') {
5643 *p++ = '\\';
5644 *p++ = 'n';
5645 }
5646 else if (ch == '\r') {
5647 *p++ = '\\';
5648 *p++ = 'r';
5649 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005650
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005651 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005652 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005654 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005655 *p++ = hexdigits[(ch >> 4) & 0x000F];
5656 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005657 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005658
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 /* Copy everything else as-is */
5660 else
5661 *p++ = (char) ch;
5662 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005664 assert(p - PyBytes_AS_STRING(repr) > 0);
5665 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5666 return NULL;
5667 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668}
5669
Alexander Belopolsky40018472011-02-26 01:02:56 +00005670PyObject *
5671PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005673 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 if (!PyUnicode_Check(unicode)) {
5675 PyErr_BadArgument();
5676 return NULL;
5677 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005678 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5679 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005680 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681}
5682
5683/* --- Raw Unicode Escape Codec ------------------------------------------- */
5684
Alexander Belopolsky40018472011-02-26 01:02:56 +00005685PyObject *
5686PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005687 Py_ssize_t size,
5688 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005690 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005691 Py_ssize_t startinpos;
5692 Py_ssize_t endinpos;
5693 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005695 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696 const char *end;
5697 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005698 PyObject *errorHandler = NULL;
5699 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005700
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 /* Escaped strings will always be longer than the resulting
5702 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005703 length after conversion to the true value. (But decoding error
5704 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 v = _PyUnicode_New(size);
5706 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005709 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005710 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 end = s + size;
5712 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005713 unsigned char c;
5714 Py_UCS4 x;
5715 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005716 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717
Benjamin Peterson29060642009-01-31 22:14:21 +00005718 /* Non-escape characters are interpreted as Unicode ordinals */
5719 if (*s != '\\') {
5720 *p++ = (unsigned char)*s++;
5721 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005722 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005723 startinpos = s-starts;
5724
5725 /* \u-escapes are only interpreted iff the number of leading
5726 backslashes if odd */
5727 bs = s;
5728 for (;s < end;) {
5729 if (*s != '\\')
5730 break;
5731 *p++ = (unsigned char)*s++;
5732 }
5733 if (((s - bs) & 1) == 0 ||
5734 s >= end ||
5735 (*s != 'u' && *s != 'U')) {
5736 continue;
5737 }
5738 p--;
5739 count = *s=='u' ? 4 : 8;
5740 s++;
5741
5742 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5743 outpos = p-PyUnicode_AS_UNICODE(v);
5744 for (x = 0, i = 0; i < count; ++i, ++s) {
5745 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005746 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 endinpos = s-starts;
5748 if (unicode_decode_call_errorhandler(
5749 errors, &errorHandler,
5750 "rawunicodeescape", "truncated \\uXXXX",
5751 &starts, &end, &startinpos, &endinpos, &exc, &s,
5752 &v, &outpos, &p))
5753 goto onError;
5754 goto nextByte;
5755 }
5756 x = (x<<4) & ~0xF;
5757 if (c >= '0' && c <= '9')
5758 x += c - '0';
5759 else if (c >= 'a' && c <= 'f')
5760 x += 10 + c - 'a';
5761 else
5762 x += 10 + c - 'A';
5763 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005764 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 /* UCS-2 character */
5766 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005767 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 /* UCS-4 character. Either store directly, or as
5769 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005770#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005771 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005772#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 x -= 0x10000L;
5774 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5775 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005776#endif
5777 } else {
5778 endinpos = s-starts;
5779 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005780 if (unicode_decode_call_errorhandler(
5781 errors, &errorHandler,
5782 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 &starts, &end, &startinpos, &endinpos, &exc, &s,
5784 &v, &outpos, &p))
5785 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005786 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 nextByte:
5788 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02005790 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005791 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005792 Py_XDECREF(errorHandler);
5793 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005794 if (PyUnicode_READY(v) == -1) {
5795 Py_DECREF(v);
5796 return NULL;
5797 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005799
Benjamin Peterson29060642009-01-31 22:14:21 +00005800 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005802 Py_XDECREF(errorHandler);
5803 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804 return NULL;
5805}
5806
Alexander Belopolsky40018472011-02-26 01:02:56 +00005807PyObject *
5808PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005809 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005811 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812 char *p;
5813 char *q;
5814
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005815#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005816 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005817#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005818 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005819#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00005820
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005821 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005823
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005824 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 if (repr == NULL)
5826 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005827 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005828 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005830 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831 while (size-- > 0) {
5832 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005833#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 /* Map 32-bit characters to '\Uxxxxxxxx' */
5835 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005836 *p++ = '\\';
5837 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005838 *p++ = hexdigits[(ch >> 28) & 0xf];
5839 *p++ = hexdigits[(ch >> 24) & 0xf];
5840 *p++ = hexdigits[(ch >> 20) & 0xf];
5841 *p++ = hexdigits[(ch >> 16) & 0xf];
5842 *p++ = hexdigits[(ch >> 12) & 0xf];
5843 *p++ = hexdigits[(ch >> 8) & 0xf];
5844 *p++ = hexdigits[(ch >> 4) & 0xf];
5845 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005846 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005847 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00005848#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005849 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5850 if (ch >= 0xD800 && ch < 0xDC00) {
5851 Py_UNICODE ch2;
5852 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005853
Benjamin Peterson29060642009-01-31 22:14:21 +00005854 ch2 = *s++;
5855 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005856 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005857 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5858 *p++ = '\\';
5859 *p++ = 'U';
5860 *p++ = hexdigits[(ucs >> 28) & 0xf];
5861 *p++ = hexdigits[(ucs >> 24) & 0xf];
5862 *p++ = hexdigits[(ucs >> 20) & 0xf];
5863 *p++ = hexdigits[(ucs >> 16) & 0xf];
5864 *p++ = hexdigits[(ucs >> 12) & 0xf];
5865 *p++ = hexdigits[(ucs >> 8) & 0xf];
5866 *p++ = hexdigits[(ucs >> 4) & 0xf];
5867 *p++ = hexdigits[ucs & 0xf];
5868 continue;
5869 }
5870 /* Fall through: isolated surrogates are copied as-is */
5871 s--;
5872 size++;
5873 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005874#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005875 /* Map 16-bit characters to '\uxxxx' */
5876 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877 *p++ = '\\';
5878 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00005879 *p++ = hexdigits[(ch >> 12) & 0xf];
5880 *p++ = hexdigits[(ch >> 8) & 0xf];
5881 *p++ = hexdigits[(ch >> 4) & 0xf];
5882 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005884 /* Copy everything else as-is */
5885 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886 *p++ = (char) ch;
5887 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005888 size = p - q;
5889
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005890 assert(size > 0);
5891 if (_PyBytes_Resize(&repr, size) < 0)
5892 return NULL;
5893 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894}
5895
Alexander Belopolsky40018472011-02-26 01:02:56 +00005896PyObject *
5897PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005899 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00005901 PyErr_BadArgument();
5902 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 }
Walter Dörwald711005d2007-05-12 12:03:26 +00005904 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5905 PyUnicode_GET_SIZE(unicode));
5906
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005907 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908}
5909
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005910/* --- Unicode Internal Codec ------------------------------------------- */
5911
Alexander Belopolsky40018472011-02-26 01:02:56 +00005912PyObject *
5913_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005914 Py_ssize_t size,
5915 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005916{
5917 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005918 Py_ssize_t startinpos;
5919 Py_ssize_t endinpos;
5920 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005921 PyUnicodeObject *v;
5922 Py_UNICODE *p;
5923 const char *end;
5924 const char *reason;
5925 PyObject *errorHandler = NULL;
5926 PyObject *exc = NULL;
5927
Neal Norwitzd43069c2006-01-08 01:12:10 +00005928#ifdef Py_UNICODE_WIDE
5929 Py_UNICODE unimax = PyUnicode_GetMax();
5930#endif
5931
Thomas Wouters89f507f2006-12-13 04:49:30 +00005932 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005933 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
5934 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005935 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005936 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
5937 as string was created with the old API. */
5938 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005939 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005940 p = PyUnicode_AS_UNICODE(v);
5941 end = s + size;
5942
5943 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00005944 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005945 /* We have to sanity check the raw data, otherwise doom looms for
5946 some malformed UCS-4 data. */
5947 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005948#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005949 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005950#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005951 end-s < Py_UNICODE_SIZE
5952 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005954 startinpos = s - starts;
5955 if (end-s < Py_UNICODE_SIZE) {
5956 endinpos = end-starts;
5957 reason = "truncated input";
5958 }
5959 else {
5960 endinpos = s - starts + Py_UNICODE_SIZE;
5961 reason = "illegal code point (> 0x10FFFF)";
5962 }
5963 outpos = p - PyUnicode_AS_UNICODE(v);
5964 if (unicode_decode_call_errorhandler(
5965 errors, &errorHandler,
5966 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00005967 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00005968 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005969 goto onError;
5970 }
5971 }
5972 else {
5973 p++;
5974 s += Py_UNICODE_SIZE;
5975 }
5976 }
5977
Victor Stinnerfe226c02011-10-03 03:52:20 +02005978 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005979 goto onError;
5980 Py_XDECREF(errorHandler);
5981 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005982 if (PyUnicode_READY(v) == -1) {
5983 Py_DECREF(v);
5984 return NULL;
5985 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005986 return (PyObject *)v;
5987
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005989 Py_XDECREF(v);
5990 Py_XDECREF(errorHandler);
5991 Py_XDECREF(exc);
5992 return NULL;
5993}
5994
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995/* --- Latin-1 Codec ------------------------------------------------------ */
5996
Alexander Belopolsky40018472011-02-26 01:02:56 +00005997PyObject *
5998PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005999 Py_ssize_t size,
6000 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006001{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006003 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004}
6005
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006006/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006007static void
6008make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006009 const char *encoding,
6010 const Py_UNICODE *unicode, Py_ssize_t size,
6011 Py_ssize_t startpos, Py_ssize_t endpos,
6012 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006014 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006015 *exceptionObject = PyUnicodeEncodeError_Create(
6016 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 }
6018 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6020 goto onError;
6021 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6022 goto onError;
6023 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6024 goto onError;
6025 return;
6026 onError:
6027 Py_DECREF(*exceptionObject);
6028 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 }
6030}
6031
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006032/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006033static void
6034raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006035 const char *encoding,
6036 const Py_UNICODE *unicode, Py_ssize_t size,
6037 Py_ssize_t startpos, Py_ssize_t endpos,
6038 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006039{
6040 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006042 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006043 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006044}
6045
6046/* error handling callback helper:
6047 build arguments, call the callback and check the arguments,
6048 put the result into newpos and return the replacement string, which
6049 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006050static PyObject *
6051unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006052 PyObject **errorHandler,
6053 const char *encoding, const char *reason,
6054 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6055 Py_ssize_t startpos, Py_ssize_t endpos,
6056 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006057{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006058 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006059
6060 PyObject *restuple;
6061 PyObject *resunicode;
6062
6063 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006065 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006067 }
6068
6069 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006070 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006071 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006072 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006073
6074 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006076 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006078 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006079 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 Py_DECREF(restuple);
6081 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006082 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006083 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 &resunicode, newpos)) {
6085 Py_DECREF(restuple);
6086 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006087 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006088 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6089 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6090 Py_DECREF(restuple);
6091 return NULL;
6092 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006093 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006095 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006096 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6097 Py_DECREF(restuple);
6098 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006099 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006100 Py_INCREF(resunicode);
6101 Py_DECREF(restuple);
6102 return resunicode;
6103}
6104
Alexander Belopolsky40018472011-02-26 01:02:56 +00006105static PyObject *
6106unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006107 Py_ssize_t size,
6108 const char *errors,
6109 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006110{
6111 /* output object */
6112 PyObject *res;
6113 /* pointers to the beginning and end+1 of input */
6114 const Py_UNICODE *startp = p;
6115 const Py_UNICODE *endp = p + size;
6116 /* pointer to the beginning of the unencodable characters */
6117 /* const Py_UNICODE *badp = NULL; */
6118 /* pointer into the output */
6119 char *str;
6120 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006121 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006122 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6123 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006124 PyObject *errorHandler = NULL;
6125 PyObject *exc = NULL;
6126 /* the following variable is used for caching string comparisons
6127 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6128 int known_errorHandler = -1;
6129
6130 /* allocate enough for a simple encoding without
6131 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006132 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006133 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006134 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006135 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006136 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006137 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006138 ressize = size;
6139
6140 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006141 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006142
Benjamin Peterson29060642009-01-31 22:14:21 +00006143 /* can we encode this? */
6144 if (c<limit) {
6145 /* no overflow check, because we know that the space is enough */
6146 *str++ = (char)c;
6147 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006148 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006149 else {
6150 Py_ssize_t unicodepos = p-startp;
6151 Py_ssize_t requiredsize;
6152 PyObject *repunicode;
6153 Py_ssize_t repsize;
6154 Py_ssize_t newpos;
6155 Py_ssize_t respos;
6156 Py_UNICODE *uni2;
6157 /* startpos for collecting unencodable chars */
6158 const Py_UNICODE *collstart = p;
6159 const Py_UNICODE *collend = p;
6160 /* find all unecodable characters */
6161 while ((collend < endp) && ((*collend)>=limit))
6162 ++collend;
6163 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6164 if (known_errorHandler==-1) {
6165 if ((errors==NULL) || (!strcmp(errors, "strict")))
6166 known_errorHandler = 1;
6167 else if (!strcmp(errors, "replace"))
6168 known_errorHandler = 2;
6169 else if (!strcmp(errors, "ignore"))
6170 known_errorHandler = 3;
6171 else if (!strcmp(errors, "xmlcharrefreplace"))
6172 known_errorHandler = 4;
6173 else
6174 known_errorHandler = 0;
6175 }
6176 switch (known_errorHandler) {
6177 case 1: /* strict */
6178 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6179 goto onError;
6180 case 2: /* replace */
6181 while (collstart++<collend)
6182 *str++ = '?'; /* fall through */
6183 case 3: /* ignore */
6184 p = collend;
6185 break;
6186 case 4: /* xmlcharrefreplace */
6187 respos = str - PyBytes_AS_STRING(res);
6188 /* determine replacement size (temporarily (mis)uses p) */
6189 for (p = collstart, repsize = 0; p < collend; ++p) {
6190 if (*p<10)
6191 repsize += 2+1+1;
6192 else if (*p<100)
6193 repsize += 2+2+1;
6194 else if (*p<1000)
6195 repsize += 2+3+1;
6196 else if (*p<10000)
6197 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006198#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006199 else
6200 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006201#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006202 else if (*p<100000)
6203 repsize += 2+5+1;
6204 else if (*p<1000000)
6205 repsize += 2+6+1;
6206 else
6207 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006208#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006209 }
6210 requiredsize = respos+repsize+(endp-collend);
6211 if (requiredsize > ressize) {
6212 if (requiredsize<2*ressize)
6213 requiredsize = 2*ressize;
6214 if (_PyBytes_Resize(&res, requiredsize))
6215 goto onError;
6216 str = PyBytes_AS_STRING(res) + respos;
6217 ressize = requiredsize;
6218 }
6219 /* generate replacement (temporarily (mis)uses p) */
6220 for (p = collstart; p < collend; ++p) {
6221 str += sprintf(str, "&#%d;", (int)*p);
6222 }
6223 p = collend;
6224 break;
6225 default:
6226 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6227 encoding, reason, startp, size, &exc,
6228 collstart-startp, collend-startp, &newpos);
6229 if (repunicode == NULL)
6230 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006231 if (PyBytes_Check(repunicode)) {
6232 /* Directly copy bytes result to output. */
6233 repsize = PyBytes_Size(repunicode);
6234 if (repsize > 1) {
6235 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006236 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006237 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6238 Py_DECREF(repunicode);
6239 goto onError;
6240 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006241 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006242 ressize += repsize-1;
6243 }
6244 memcpy(str, PyBytes_AsString(repunicode), repsize);
6245 str += repsize;
6246 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006247 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006248 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006249 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 /* need more space? (at least enough for what we
6251 have+the replacement+the rest of the string, so
6252 we won't have to check space for encodable characters) */
6253 respos = str - PyBytes_AS_STRING(res);
6254 repsize = PyUnicode_GET_SIZE(repunicode);
6255 requiredsize = respos+repsize+(endp-collend);
6256 if (requiredsize > ressize) {
6257 if (requiredsize<2*ressize)
6258 requiredsize = 2*ressize;
6259 if (_PyBytes_Resize(&res, requiredsize)) {
6260 Py_DECREF(repunicode);
6261 goto onError;
6262 }
6263 str = PyBytes_AS_STRING(res) + respos;
6264 ressize = requiredsize;
6265 }
6266 /* check if there is anything unencodable in the replacement
6267 and copy it to the output */
6268 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6269 c = *uni2;
6270 if (c >= limit) {
6271 raise_encode_exception(&exc, encoding, startp, size,
6272 unicodepos, unicodepos+1, reason);
6273 Py_DECREF(repunicode);
6274 goto onError;
6275 }
6276 *str = (char)c;
6277 }
6278 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006279 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006280 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006281 }
6282 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006283 /* Resize if we allocated to much */
6284 size = str - PyBytes_AS_STRING(res);
6285 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006286 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006287 if (_PyBytes_Resize(&res, size) < 0)
6288 goto onError;
6289 }
6290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006291 Py_XDECREF(errorHandler);
6292 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006293 return res;
6294
6295 onError:
6296 Py_XDECREF(res);
6297 Py_XDECREF(errorHandler);
6298 Py_XDECREF(exc);
6299 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006300}
6301
Alexander Belopolsky40018472011-02-26 01:02:56 +00006302PyObject *
6303PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006304 Py_ssize_t size,
6305 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006307 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308}
6309
Alexander Belopolsky40018472011-02-26 01:02:56 +00006310PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006311_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312{
6313 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006314 PyErr_BadArgument();
6315 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006317 if (PyUnicode_READY(unicode) == -1)
6318 return NULL;
6319 /* Fast path: if it is a one-byte string, construct
6320 bytes object directly. */
6321 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6322 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6323 PyUnicode_GET_LENGTH(unicode));
6324 /* Non-Latin-1 characters present. Defer to above function to
6325 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006328 errors);
6329}
6330
6331PyObject*
6332PyUnicode_AsLatin1String(PyObject *unicode)
6333{
6334 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335}
6336
6337/* --- 7-bit ASCII Codec -------------------------------------------------- */
6338
Alexander Belopolsky40018472011-02-26 01:02:56 +00006339PyObject *
6340PyUnicode_DecodeASCII(const char *s,
6341 Py_ssize_t size,
6342 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006344 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345 PyUnicodeObject *v;
6346 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006347 Py_ssize_t startinpos;
6348 Py_ssize_t endinpos;
6349 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006350 const char *e;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006351 unsigned char* d;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006352 PyObject *errorHandler = NULL;
6353 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006354 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00006355
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006357 if (size == 1 && *(unsigned char*)s < 128)
6358 return PyUnicode_FromOrdinal(*(unsigned char*)s);
6359
6360 /* Fast path. Assume the input actually *is* ASCII, and allocate
6361 a single-block Unicode object with that assumption. If there is
6362 an error, drop the object and start over. */
6363 v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6364 if (v == NULL)
6365 goto onError;
6366 d = PyUnicode_1BYTE_DATA(v);
6367 for (i = 0; i < size; i++) {
6368 unsigned char ch = ((unsigned char*)s)[i];
6369 if (ch < 128)
6370 d[i] = ch;
6371 else
6372 break;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006374 if (i == size)
6375 return (PyObject*)v;
6376 Py_DECREF(v); /* start over */
Tim Petersced69f82003-09-16 20:30:58 +00006377
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378 v = _PyUnicode_New(size);
6379 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006382 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006384 e = s + size;
6385 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 register unsigned char c = (unsigned char)*s;
6387 if (c < 128) {
6388 *p++ = c;
6389 ++s;
6390 }
6391 else {
6392 startinpos = s-starts;
6393 endinpos = startinpos + 1;
6394 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6395 if (unicode_decode_call_errorhandler(
6396 errors, &errorHandler,
6397 "ascii", "ordinal not in range(128)",
6398 &starts, &e, &startinpos, &endinpos, &exc, &s,
6399 &v, &outpos, &p))
6400 goto onError;
6401 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00006403 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006404 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006405 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406 Py_XDECREF(errorHandler);
6407 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006408 if (PyUnicode_READY(v) == -1) {
6409 Py_DECREF(v);
6410 return NULL;
6411 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006413
Benjamin Peterson29060642009-01-31 22:14:21 +00006414 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006416 Py_XDECREF(errorHandler);
6417 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418 return NULL;
6419}
6420
Alexander Belopolsky40018472011-02-26 01:02:56 +00006421PyObject *
6422PyUnicode_EncodeASCII(const Py_UNICODE *p,
6423 Py_ssize_t size,
6424 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006426 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427}
6428
Alexander Belopolsky40018472011-02-26 01:02:56 +00006429PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006430_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431{
6432 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006433 PyErr_BadArgument();
6434 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006436 if (PyUnicode_READY(unicode) == -1)
6437 return NULL;
6438 /* Fast path: if it is an ASCII-only string, construct bytes object
6439 directly. Else defer to above function to raise the exception. */
6440 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6441 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6442 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006444 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006445 errors);
6446}
6447
6448PyObject *
6449PyUnicode_AsASCIIString(PyObject *unicode)
6450{
6451 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452}
6453
Victor Stinner99b95382011-07-04 14:23:54 +02006454#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006455
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006456/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006457
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006458#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006459#define NEED_RETRY
6460#endif
6461
6462/* XXX This code is limited to "true" double-byte encodings, as
6463 a) it assumes an incomplete character consists of a single byte, and
6464 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006465 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006466
/* Return non-zero if the byte at s[offset] is a DBCS lead byte that
   starts a character, as opposed to a byte that merely looks like a lead
   byte but is the second half of the preceding character. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    /* Step back one character (as CharPrev sees it) to decide whether
       curr begins a character of its own. */
    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
6478
6479/*
6480 * Decode MBCS string into unicode object. If 'final' is set, converts
6481 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6482 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006483static int
6484decode_mbcs(PyUnicodeObject **v,
6485 const char *s, /* MBCS string */
6486 int size, /* sizeof MBCS string */
6487 int final,
6488 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006489{
6490 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006491 Py_ssize_t n;
6492 DWORD usize;
6493 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006494
6495 assert(size >= 0);
6496
Victor Stinner554f3f02010-06-16 23:33:54 +00006497 /* check and handle 'errors' arg */
6498 if (errors==NULL || strcmp(errors, "strict")==0)
6499 flags = MB_ERR_INVALID_CHARS;
6500 else if (strcmp(errors, "ignore")==0)
6501 flags = 0;
6502 else {
6503 PyErr_Format(PyExc_ValueError,
6504 "mbcs encoding does not support errors='%s'",
6505 errors);
6506 return -1;
6507 }
6508
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006509 /* Skip trailing lead-byte unless 'final' is set */
6510 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006512
6513 /* First get the size of the result */
6514 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006515 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6516 if (usize==0)
6517 goto mbcs_decode_error;
6518 } else
6519 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006520
6521 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006522 /* Create unicode object */
6523 *v = _PyUnicode_New(usize);
6524 if (*v == NULL)
6525 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006526 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006527 }
6528 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 /* Extend unicode object */
6530 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006531 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006533 }
6534
6535 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006536 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006537 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006538 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6539 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006541 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006542 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006543
6544mbcs_decode_error:
6545 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6546 we raise a UnicodeDecodeError - else it is a 'generic'
6547 windows error
6548 */
6549 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6550 /* Ideally, we should get reason from FormatMessage - this
6551 is the Windows 2000 English version of the message
6552 */
6553 PyObject *exc = NULL;
6554 const char *reason = "No mapping for the Unicode character exists "
6555 "in the target multi-byte code page.";
6556 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6557 if (exc != NULL) {
6558 PyCodec_StrictErrors(exc);
6559 Py_DECREF(exc);
6560 }
6561 } else {
6562 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6563 }
6564 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006565}
6566
Alexander Belopolsky40018472011-02-26 01:02:56 +00006567PyObject *
6568PyUnicode_DecodeMBCSStateful(const char *s,
6569 Py_ssize_t size,
6570 const char *errors,
6571 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006572{
6573 PyUnicodeObject *v = NULL;
6574 int done;
6575
6576 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006578
6579#ifdef NEED_RETRY
6580 retry:
6581 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006582 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006583 else
6584#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006585 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006586
6587 if (done < 0) {
6588 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006590 }
6591
6592 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006594
6595#ifdef NEED_RETRY
6596 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 s += done;
6598 size -= done;
6599 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006600 }
6601#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006602 if (PyUnicode_READY(v) == -1) {
6603 Py_DECREF(v);
6604 return NULL;
6605 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006606 return (PyObject *)v;
6607}
6608
Alexander Belopolsky40018472011-02-26 01:02:56 +00006609PyObject *
6610PyUnicode_DecodeMBCS(const char *s,
6611 Py_ssize_t size,
6612 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006613{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006614 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6615}
6616
6617/*
6618 * Convert unicode into string object (MBCS).
6619 * Returns 0 if succeed, -1 otherwise.
6620 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006621static int
6622encode_mbcs(PyObject **repr,
6623 const Py_UNICODE *p, /* unicode */
6624 int size, /* size of unicode */
6625 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006626{
Victor Stinner554f3f02010-06-16 23:33:54 +00006627 BOOL usedDefaultChar = FALSE;
6628 BOOL *pusedDefaultChar;
6629 int mbcssize;
6630 Py_ssize_t n;
6631 PyObject *exc = NULL;
6632 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006633
6634 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006635
Victor Stinner554f3f02010-06-16 23:33:54 +00006636 /* check and handle 'errors' arg */
6637 if (errors==NULL || strcmp(errors, "strict")==0) {
6638 flags = WC_NO_BEST_FIT_CHARS;
6639 pusedDefaultChar = &usedDefaultChar;
6640 } else if (strcmp(errors, "replace")==0) {
6641 flags = 0;
6642 pusedDefaultChar = NULL;
6643 } else {
6644 PyErr_Format(PyExc_ValueError,
6645 "mbcs encoding does not support errors='%s'",
6646 errors);
6647 return -1;
6648 }
6649
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006650 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006651 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006652 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6653 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 if (mbcssize == 0) {
6655 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6656 return -1;
6657 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006658 /* If we used a default char, then we failed! */
6659 if (pusedDefaultChar && *pusedDefaultChar)
6660 goto mbcs_encode_error;
6661 } else {
6662 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006663 }
6664
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006665 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006666 /* Create string object */
6667 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6668 if (*repr == NULL)
6669 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006670 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006671 }
6672 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006673 /* Extend string object */
6674 n = PyBytes_Size(*repr);
6675 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6676 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006677 }
6678
6679 /* Do the conversion */
6680 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006681 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006682 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6683 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6685 return -1;
6686 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006687 if (pusedDefaultChar && *pusedDefaultChar)
6688 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006689 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006690 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006691
6692mbcs_encode_error:
6693 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6694 Py_XDECREF(exc);
6695 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006696}
6697
Alexander Belopolsky40018472011-02-26 01:02:56 +00006698PyObject *
6699PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6700 Py_ssize_t size,
6701 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006702{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006703 PyObject *repr = NULL;
6704 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006705
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006706#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006707 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006708 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006709 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006710 else
6711#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006712 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006713
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006714 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006715 Py_XDECREF(repr);
6716 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006717 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006718
6719#ifdef NEED_RETRY
6720 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 p += INT_MAX;
6722 size -= INT_MAX;
6723 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006724 }
6725#endif
6726
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006727 return repr;
6728}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006729
Alexander Belopolsky40018472011-02-26 01:02:56 +00006730PyObject *
6731PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006732{
6733 if (!PyUnicode_Check(unicode)) {
6734 PyErr_BadArgument();
6735 return NULL;
6736 }
6737 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 PyUnicode_GET_SIZE(unicode),
6739 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006740}
6741
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006742#undef NEED_RETRY
6743
Victor Stinner99b95382011-07-04 14:23:54 +02006744#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006745
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746/* --- Character Mapping Codec -------------------------------------------- */
6747
Alexander Belopolsky40018472011-02-26 01:02:56 +00006748PyObject *
6749PyUnicode_DecodeCharmap(const char *s,
6750 Py_ssize_t size,
6751 PyObject *mapping,
6752 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006754 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006755 Py_ssize_t startinpos;
6756 Py_ssize_t endinpos;
6757 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006758 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759 PyUnicodeObject *v;
6760 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006761 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006762 PyObject *errorHandler = NULL;
6763 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006764 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006765 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006766
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 /* Default to Latin-1 */
6768 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006769 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770
6771 v = _PyUnicode_New(size);
6772 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006777 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006778 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 mapstring = PyUnicode_AS_UNICODE(mapping);
6780 maplen = PyUnicode_GET_SIZE(mapping);
6781 while (s < e) {
6782 unsigned char ch = *s;
6783 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784
Benjamin Peterson29060642009-01-31 22:14:21 +00006785 if (ch < maplen)
6786 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787
Benjamin Peterson29060642009-01-31 22:14:21 +00006788 if (x == 0xfffe) {
6789 /* undefined mapping */
6790 outpos = p-PyUnicode_AS_UNICODE(v);
6791 startinpos = s-starts;
6792 endinpos = startinpos+1;
6793 if (unicode_decode_call_errorhandler(
6794 errors, &errorHandler,
6795 "charmap", "character maps to <undefined>",
6796 &starts, &e, &startinpos, &endinpos, &exc, &s,
6797 &v, &outpos, &p)) {
6798 goto onError;
6799 }
6800 continue;
6801 }
6802 *p++ = x;
6803 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006804 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006805 }
6806 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 while (s < e) {
6808 unsigned char ch = *s;
6809 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00006810
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 /* Get mapping (char ordinal -> integer, Unicode char or None) */
6812 w = PyLong_FromLong((long)ch);
6813 if (w == NULL)
6814 goto onError;
6815 x = PyObject_GetItem(mapping, w);
6816 Py_DECREF(w);
6817 if (x == NULL) {
6818 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6819 /* No mapping found means: mapping is undefined. */
6820 PyErr_Clear();
6821 x = Py_None;
6822 Py_INCREF(x);
6823 } else
6824 goto onError;
6825 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006826
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 /* Apply mapping */
6828 if (PyLong_Check(x)) {
6829 long value = PyLong_AS_LONG(x);
6830 if (value < 0 || value > 65535) {
6831 PyErr_SetString(PyExc_TypeError,
6832 "character mapping must be in range(65536)");
6833 Py_DECREF(x);
6834 goto onError;
6835 }
6836 *p++ = (Py_UNICODE)value;
6837 }
6838 else if (x == Py_None) {
6839 /* undefined mapping */
6840 outpos = p-PyUnicode_AS_UNICODE(v);
6841 startinpos = s-starts;
6842 endinpos = startinpos+1;
6843 if (unicode_decode_call_errorhandler(
6844 errors, &errorHandler,
6845 "charmap", "character maps to <undefined>",
6846 &starts, &e, &startinpos, &endinpos, &exc, &s,
6847 &v, &outpos, &p)) {
6848 Py_DECREF(x);
6849 goto onError;
6850 }
6851 Py_DECREF(x);
6852 continue;
6853 }
6854 else if (PyUnicode_Check(x)) {
6855 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006856
Benjamin Peterson29060642009-01-31 22:14:21 +00006857 if (targetsize == 1)
6858 /* 1-1 mapping */
6859 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006860
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 else if (targetsize > 1) {
6862 /* 1-n mapping */
6863 if (targetsize > extrachars) {
6864 /* resize first */
6865 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6866 Py_ssize_t needed = (targetsize - extrachars) + \
6867 (targetsize << 2);
6868 extrachars += needed;
6869 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02006870 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00006871 PyUnicode_GET_SIZE(v) + needed) < 0) {
6872 Py_DECREF(x);
6873 goto onError;
6874 }
6875 p = PyUnicode_AS_UNICODE(v) + oldpos;
6876 }
6877 Py_UNICODE_COPY(p,
6878 PyUnicode_AS_UNICODE(x),
6879 targetsize);
6880 p += targetsize;
6881 extrachars -= targetsize;
6882 }
6883 /* 1-0 mapping: skip the character */
6884 }
6885 else {
6886 /* wrong return value */
6887 PyErr_SetString(PyExc_TypeError,
6888 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006889 Py_DECREF(x);
6890 goto onError;
6891 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006892 Py_DECREF(x);
6893 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006894 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895 }
6896 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02006897 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006898 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006899 Py_XDECREF(errorHandler);
6900 Py_XDECREF(exc);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006901 if (PyUnicode_READY(v) == -1) {
6902 Py_DECREF(v);
6903 return NULL;
6904 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006906
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006908 Py_XDECREF(errorHandler);
6909 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 Py_XDECREF(v);
6911 return NULL;
6912}
6913
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006914/* Charmap encoding: the lookup table */
6915
Alexander Belopolsky40018472011-02-26 01:02:56 +00006916struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 PyObject_HEAD
6918 unsigned char level1[32];
6919 int count2, count3;
6920 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006921};
6922
6923static PyObject*
6924encoding_map_size(PyObject *obj, PyObject* args)
6925{
6926 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006927 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00006928 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006929}
6930
6931static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006932 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 PyDoc_STR("Return the size (in bytes) of this object") },
6934 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006935};
6936
6937static void
6938encoding_map_dealloc(PyObject* o)
6939{
Benjamin Peterson14339b62009-01-31 16:36:08 +00006940 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006941}
6942
6943static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006944 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006945 "EncodingMap", /*tp_name*/
6946 sizeof(struct encoding_map), /*tp_basicsize*/
6947 0, /*tp_itemsize*/
6948 /* methods */
6949 encoding_map_dealloc, /*tp_dealloc*/
6950 0, /*tp_print*/
6951 0, /*tp_getattr*/
6952 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00006953 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00006954 0, /*tp_repr*/
6955 0, /*tp_as_number*/
6956 0, /*tp_as_sequence*/
6957 0, /*tp_as_mapping*/
6958 0, /*tp_hash*/
6959 0, /*tp_call*/
6960 0, /*tp_str*/
6961 0, /*tp_getattro*/
6962 0, /*tp_setattro*/
6963 0, /*tp_as_buffer*/
6964 Py_TPFLAGS_DEFAULT, /*tp_flags*/
6965 0, /*tp_doc*/
6966 0, /*tp_traverse*/
6967 0, /*tp_clear*/
6968 0, /*tp_richcompare*/
6969 0, /*tp_weaklistoffset*/
6970 0, /*tp_iter*/
6971 0, /*tp_iternext*/
6972 encoding_map_methods, /*tp_methods*/
6973 0, /*tp_members*/
6974 0, /*tp_getset*/
6975 0, /*tp_base*/
6976 0, /*tp_dict*/
6977 0, /*tp_descr_get*/
6978 0, /*tp_descr_set*/
6979 0, /*tp_dictoffset*/
6980 0, /*tp_init*/
6981 0, /*tp_alloc*/
6982 0, /*tp_new*/
6983 0, /*tp_free*/
6984 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006985};
6986
6987PyObject*
6988PyUnicode_BuildEncodingMap(PyObject* string)
6989{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00006990 PyObject *result;
6991 struct encoding_map *mresult;
6992 int i;
6993 int need_dict = 0;
6994 unsigned char level1[32];
6995 unsigned char level2[512];
6996 unsigned char *mlevel1, *mlevel2, *mlevel3;
6997 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006998 int kind;
6999 void *data;
7000 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007002 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007003 PyErr_BadArgument();
7004 return NULL;
7005 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007006 kind = PyUnicode_KIND(string);
7007 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007008 memset(level1, 0xFF, sizeof level1);
7009 memset(level2, 0xFF, sizeof level2);
7010
7011 /* If there isn't a one-to-one mapping of NULL to \0,
7012 or if there are non-BMP characters, we need to use
7013 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007014 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007015 need_dict = 1;
7016 for (i = 1; i < 256; i++) {
7017 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007018 ch = PyUnicode_READ(kind, data, i);
7019 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007020 need_dict = 1;
7021 break;
7022 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007023 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007024 /* unmapped character */
7025 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007026 l1 = ch >> 11;
7027 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007028 if (level1[l1] == 0xFF)
7029 level1[l1] = count2++;
7030 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007031 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007032 }
7033
7034 if (count2 >= 0xFF || count3 >= 0xFF)
7035 need_dict = 1;
7036
7037 if (need_dict) {
7038 PyObject *result = PyDict_New();
7039 PyObject *key, *value;
7040 if (!result)
7041 return NULL;
7042 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007043 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007044 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007045 if (!key || !value)
7046 goto failed1;
7047 if (PyDict_SetItem(result, key, value) == -1)
7048 goto failed1;
7049 Py_DECREF(key);
7050 Py_DECREF(value);
7051 }
7052 return result;
7053 failed1:
7054 Py_XDECREF(key);
7055 Py_XDECREF(value);
7056 Py_DECREF(result);
7057 return NULL;
7058 }
7059
7060 /* Create a three-level trie */
7061 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7062 16*count2 + 128*count3 - 1);
7063 if (!result)
7064 return PyErr_NoMemory();
7065 PyObject_Init(result, &EncodingMapType);
7066 mresult = (struct encoding_map*)result;
7067 mresult->count2 = count2;
7068 mresult->count3 = count3;
7069 mlevel1 = mresult->level1;
7070 mlevel2 = mresult->level23;
7071 mlevel3 = mresult->level23 + 16*count2;
7072 memcpy(mlevel1, level1, 32);
7073 memset(mlevel2, 0xFF, 16*count2);
7074 memset(mlevel3, 0, 128*count3);
7075 count3 = 0;
7076 for (i = 1; i < 256; i++) {
7077 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007078 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007079 /* unmapped character */
7080 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007081 o1 = PyUnicode_READ(kind, data, i)>>11;
7082 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007083 i2 = 16*mlevel1[o1] + o2;
7084 if (mlevel2[i2] == 0xFF)
7085 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007086 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007087 i3 = 128*mlevel2[i2] + o3;
7088 mlevel3[i3] = i;
7089 }
7090 return result;
7091}
7092
7093static int
7094encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7095{
7096 struct encoding_map *map = (struct encoding_map*)mapping;
7097 int l1 = c>>11;
7098 int l2 = (c>>7) & 0xF;
7099 int l3 = c & 0x7F;
7100 int i;
7101
7102#ifdef Py_UNICODE_WIDE
7103 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007104 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007105 }
7106#endif
7107 if (c == 0)
7108 return 0;
7109 /* level 1*/
7110 i = map->level1[l1];
7111 if (i == 0xFF) {
7112 return -1;
7113 }
7114 /* level 2*/
7115 i = map->level23[16*i+l2];
7116 if (i == 0xFF) {
7117 return -1;
7118 }
7119 /* level 3 */
7120 i = map->level23[16*map->count2 + 128*i + l3];
7121 if (i == 0) {
7122 return -1;
7123 }
7124 return i;
7125}
7126
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007127/* Lookup the character ch in the mapping. If the character
7128 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007129 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007130static PyObject *
7131charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132{
Christian Heimes217cfd12007-12-02 14:31:20 +00007133 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007134 PyObject *x;
7135
7136 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007138 x = PyObject_GetItem(mapping, w);
7139 Py_DECREF(w);
7140 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7142 /* No mapping found means: mapping is undefined. */
7143 PyErr_Clear();
7144 x = Py_None;
7145 Py_INCREF(x);
7146 return x;
7147 } else
7148 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007150 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007151 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007152 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007153 long value = PyLong_AS_LONG(x);
7154 if (value < 0 || value > 255) {
7155 PyErr_SetString(PyExc_TypeError,
7156 "character mapping must be in range(256)");
7157 Py_DECREF(x);
7158 return NULL;
7159 }
7160 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007161 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007162 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007163 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007165 /* wrong return value */
7166 PyErr_Format(PyExc_TypeError,
7167 "character mapping must return integer, bytes or None, not %.400s",
7168 x->ob_type->tp_name);
7169 Py_DECREF(x);
7170 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171 }
7172}
7173
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007174static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007175charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007176{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007177 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7178 /* exponentially overallocate to minimize reallocations */
7179 if (requiredsize < 2*outsize)
7180 requiredsize = 2*outsize;
7181 if (_PyBytes_Resize(outobj, requiredsize))
7182 return -1;
7183 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007184}
7185
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded and written */
    enc_FAILED,         /* the mapping is undefined for the character */
    enc_EXCEPTION       /* lookup or buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007189/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007190 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007191 space is available. Return a new reference to the object that
7192 was put in the output buffer, or Py_None, if the mapping was undefined
7193 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007194 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007195static charmapencode_result
7196charmapencode_output(Py_UNICODE c, PyObject *mapping,
7197 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007198{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007199 PyObject *rep;
7200 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007201 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007202
Christian Heimes90aa7642007-12-19 02:45:37 +00007203 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007204 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007205 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007206 if (res == -1)
7207 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007208 if (outsize<requiredsize)
7209 if (charmapencode_resize(outobj, outpos, requiredsize))
7210 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007211 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 outstart[(*outpos)++] = (char)res;
7213 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007214 }
7215
7216 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007217 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007218 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007219 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007220 Py_DECREF(rep);
7221 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007222 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007223 if (PyLong_Check(rep)) {
7224 Py_ssize_t requiredsize = *outpos+1;
7225 if (outsize<requiredsize)
7226 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7227 Py_DECREF(rep);
7228 return enc_EXCEPTION;
7229 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007230 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007231 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007232 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007233 else {
7234 const char *repchars = PyBytes_AS_STRING(rep);
7235 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7236 Py_ssize_t requiredsize = *outpos+repsize;
7237 if (outsize<requiredsize)
7238 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7239 Py_DECREF(rep);
7240 return enc_EXCEPTION;
7241 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007242 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007243 memcpy(outstart + *outpos, repchars, repsize);
7244 *outpos += repsize;
7245 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007246 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007247 Py_DECREF(rep);
7248 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007249}
7250
7251/* handle an error in PyUnicode_EncodeCharmap
7252 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007253static int
7254charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007255 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007256 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007257 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007258 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007259{
7260 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007261 Py_ssize_t repsize;
7262 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007263 Py_UNICODE *uni2;
7264 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007265 Py_ssize_t collstartpos = *inpos;
7266 Py_ssize_t collendpos = *inpos+1;
7267 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007268 char *encoding = "charmap";
7269 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007270 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007271
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007272 /* find all unencodable characters */
7273 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007274 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007275 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007276 int res = encoding_map_lookup(p[collendpos], mapping);
7277 if (res != -1)
7278 break;
7279 ++collendpos;
7280 continue;
7281 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007282
Benjamin Peterson29060642009-01-31 22:14:21 +00007283 rep = charmapencode_lookup(p[collendpos], mapping);
7284 if (rep==NULL)
7285 return -1;
7286 else if (rep!=Py_None) {
7287 Py_DECREF(rep);
7288 break;
7289 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007290 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007291 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007292 }
7293 /* cache callback name lookup
7294 * (if not done yet, i.e. it's the first error) */
7295 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007296 if ((errors==NULL) || (!strcmp(errors, "strict")))
7297 *known_errorHandler = 1;
7298 else if (!strcmp(errors, "replace"))
7299 *known_errorHandler = 2;
7300 else if (!strcmp(errors, "ignore"))
7301 *known_errorHandler = 3;
7302 else if (!strcmp(errors, "xmlcharrefreplace"))
7303 *known_errorHandler = 4;
7304 else
7305 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007306 }
7307 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007308 case 1: /* strict */
7309 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7310 return -1;
7311 case 2: /* replace */
7312 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007313 x = charmapencode_output('?', mapping, res, respos);
7314 if (x==enc_EXCEPTION) {
7315 return -1;
7316 }
7317 else if (x==enc_FAILED) {
7318 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7319 return -1;
7320 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007321 }
7322 /* fall through */
7323 case 3: /* ignore */
7324 *inpos = collendpos;
7325 break;
7326 case 4: /* xmlcharrefreplace */
7327 /* generate replacement (temporarily (mis)uses p) */
7328 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007329 char buffer[2+29+1+1];
7330 char *cp;
7331 sprintf(buffer, "&#%d;", (int)p[collpos]);
7332 for (cp = buffer; *cp; ++cp) {
7333 x = charmapencode_output(*cp, mapping, res, respos);
7334 if (x==enc_EXCEPTION)
7335 return -1;
7336 else if (x==enc_FAILED) {
7337 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7338 return -1;
7339 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007340 }
7341 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007342 *inpos = collendpos;
7343 break;
7344 default:
7345 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007346 encoding, reason, p, size, exceptionObject,
7347 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007348 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007350 if (PyBytes_Check(repunicode)) {
7351 /* Directly copy bytes result to output. */
7352 Py_ssize_t outsize = PyBytes_Size(*res);
7353 Py_ssize_t requiredsize;
7354 repsize = PyBytes_Size(repunicode);
7355 requiredsize = *respos + repsize;
7356 if (requiredsize > outsize)
7357 /* Make room for all additional bytes. */
7358 if (charmapencode_resize(res, respos, requiredsize)) {
7359 Py_DECREF(repunicode);
7360 return -1;
7361 }
7362 memcpy(PyBytes_AsString(*res) + *respos,
7363 PyBytes_AsString(repunicode), repsize);
7364 *respos += repsize;
7365 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007366 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007367 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007368 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007369 /* generate replacement */
7370 repsize = PyUnicode_GET_SIZE(repunicode);
7371 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007372 x = charmapencode_output(*uni2, mapping, res, respos);
7373 if (x==enc_EXCEPTION) {
7374 return -1;
7375 }
7376 else if (x==enc_FAILED) {
7377 Py_DECREF(repunicode);
7378 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7379 return -1;
7380 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007381 }
7382 *inpos = newpos;
7383 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007384 }
7385 return 0;
7386}
7387
Alexander Belopolsky40018472011-02-26 01:02:56 +00007388PyObject *
7389PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7390 Py_ssize_t size,
7391 PyObject *mapping,
7392 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007394 /* output object */
7395 PyObject *res = NULL;
7396 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007397 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007398 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007399 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007400 PyObject *errorHandler = NULL;
7401 PyObject *exc = NULL;
7402 /* the following variable is used for caching string comparisons
7403 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7404 * 3=ignore, 4=xmlcharrefreplace */
7405 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406
7407 /* Default to Latin-1 */
7408 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007409 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007411 /* allocate enough for a simple encoding without
7412 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007413 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007414 if (res == NULL)
7415 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007416 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007417 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007419 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007420 /* try to encode it */
7421 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7422 if (x==enc_EXCEPTION) /* error */
7423 goto onError;
7424 if (x==enc_FAILED) { /* unencodable character */
7425 if (charmap_encoding_error(p, size, &inpos, mapping,
7426 &exc,
7427 &known_errorHandler, &errorHandler, errors,
7428 &res, &respos)) {
7429 goto onError;
7430 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007431 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 else
7433 /* done with this character => adjust input position */
7434 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007437 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007438 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007439 if (_PyBytes_Resize(&res, respos) < 0)
7440 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007441
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007442 Py_XDECREF(exc);
7443 Py_XDECREF(errorHandler);
7444 return res;
7445
Benjamin Peterson29060642009-01-31 22:14:21 +00007446 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007447 Py_XDECREF(res);
7448 Py_XDECREF(exc);
7449 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450 return NULL;
7451}
7452
Alexander Belopolsky40018472011-02-26 01:02:56 +00007453PyObject *
7454PyUnicode_AsCharmapString(PyObject *unicode,
7455 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456{
7457 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007458 PyErr_BadArgument();
7459 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460 }
7461 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 PyUnicode_GET_SIZE(unicode),
7463 mapping,
7464 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007465}
7466
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007467/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007468static void
7469make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007470 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007471 Py_ssize_t startpos, Py_ssize_t endpos,
7472 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007474 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007475 *exceptionObject = _PyUnicodeTranslateError_Create(
7476 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007477 }
7478 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007479 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7480 goto onError;
7481 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7482 goto onError;
7483 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7484 goto onError;
7485 return;
7486 onError:
7487 Py_DECREF(*exceptionObject);
7488 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489 }
7490}
7491
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007492/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007493static void
7494raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007495 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007496 Py_ssize_t startpos, Py_ssize_t endpos,
7497 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007498{
7499 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007500 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007501 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007503}
7504
7505/* error handling callback helper:
7506 build arguments, call the callback and check the arguments,
7507 put the result into newpos and return the replacement string, which
7508 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007509static PyObject *
7510unicode_translate_call_errorhandler(const char *errors,
7511 PyObject **errorHandler,
7512 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007513 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007514 Py_ssize_t startpos, Py_ssize_t endpos,
7515 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007516{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007517 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007518
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007519 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007520 PyObject *restuple;
7521 PyObject *resunicode;
7522
7523 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007524 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007525 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007526 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007527 }
7528
7529 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007530 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007531 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007532 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007533
7534 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007535 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007536 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007537 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007538 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007539 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007540 Py_DECREF(restuple);
7541 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007542 }
7543 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007544 &resunicode, &i_newpos)) {
7545 Py_DECREF(restuple);
7546 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007547 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007548 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007549 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007550 else
7551 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007552 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007553 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7554 Py_DECREF(restuple);
7555 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007556 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007557 Py_INCREF(resunicode);
7558 Py_DECREF(restuple);
7559 return resunicode;
7560}
7561
7562/* Lookup the character ch in the mapping and put the result in result,
7563 which must be decrefed by the caller.
7564 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007565static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007566charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007567{
Christian Heimes217cfd12007-12-02 14:31:20 +00007568 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007569 PyObject *x;
7570
7571 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007572 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007573 x = PyObject_GetItem(mapping, w);
7574 Py_DECREF(w);
7575 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007576 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7577 /* No mapping found means: use 1:1 mapping. */
7578 PyErr_Clear();
7579 *result = NULL;
7580 return 0;
7581 } else
7582 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007583 }
7584 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 *result = x;
7586 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007587 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007588 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 long value = PyLong_AS_LONG(x);
7590 long max = PyUnicode_GetMax();
7591 if (value < 0 || value > max) {
7592 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007593 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007594 Py_DECREF(x);
7595 return -1;
7596 }
7597 *result = x;
7598 return 0;
7599 }
7600 else if (PyUnicode_Check(x)) {
7601 *result = x;
7602 return 0;
7603 }
7604 else {
7605 /* wrong return value */
7606 PyErr_SetString(PyExc_TypeError,
7607 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007608 Py_DECREF(x);
7609 return -1;
7610 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007611}
7612/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007613 if not reallocate and adjust various state variables.
7614 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007615static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007616charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007617 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007618{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007619 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007620 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007621 /* exponentially overallocate to minimize reallocations */
7622 if (requiredsize < 2 * oldsize)
7623 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007624 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7625 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007626 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007627 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007628 }
7629 return 0;
7630}
7631/* lookup the character, put the result in the output string and adjust
7632 various state variables. Return a new reference to the object that
7633 was put in the output buffer in *result, or Py_None, if the mapping was
7634 undefined (in which case no character was written).
7635 The called must decref result.
7636 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007637static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007638charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7639 PyObject *mapping, Py_UCS4 **output,
7640 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007641 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007642{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007643 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7644 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007646 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007647 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007648 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007649 }
7650 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007651 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007652 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007653 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007654 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007655 }
7656 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007657 Py_ssize_t repsize;
7658 if (PyUnicode_READY(*res) == -1)
7659 return -1;
7660 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007661 if (repsize==1) {
7662 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007663 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007664 }
7665 else if (repsize!=0) {
7666 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007667 Py_ssize_t requiredsize = *opos +
7668 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007669 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007670 Py_ssize_t i;
7671 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007672 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007673 for(i = 0; i < repsize; i++)
7674 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007676 }
7677 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007679 return 0;
7680}
7681
Alexander Belopolsky40018472011-02-26 01:02:56 +00007682PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007683_PyUnicode_TranslateCharmap(PyObject *input,
7684 PyObject *mapping,
7685 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007687 /* input object */
7688 char *idata;
7689 Py_ssize_t size, i;
7690 int kind;
7691 /* output buffer */
7692 Py_UCS4 *output = NULL;
7693 Py_ssize_t osize;
7694 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007695 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007696 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007697 char *reason = "character maps to <undefined>";
7698 PyObject *errorHandler = NULL;
7699 PyObject *exc = NULL;
7700 /* the following variable is used for caching string comparisons
7701 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7702 * 3=ignore, 4=xmlcharrefreplace */
7703 int known_errorHandler = -1;
7704
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007706 PyErr_BadArgument();
7707 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007709
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007710 if (PyUnicode_READY(input) == -1)
7711 return NULL;
7712 idata = (char*)PyUnicode_DATA(input);
7713 kind = PyUnicode_KIND(input);
7714 size = PyUnicode_GET_LENGTH(input);
7715 i = 0;
7716
7717 if (size == 0) {
7718 Py_INCREF(input);
7719 return input;
7720 }
7721
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007722 /* allocate enough for a simple 1:1 translation without
7723 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007724 osize = size;
7725 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7726 opos = 0;
7727 if (output == NULL) {
7728 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007729 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007730 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007732 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007733 /* try to encode it */
7734 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007735 if (charmaptranslate_output(input, i, mapping,
7736 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007737 Py_XDECREF(x);
7738 goto onError;
7739 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007740 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007741 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007742 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007743 else { /* untranslatable character */
7744 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7745 Py_ssize_t repsize;
7746 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007747 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007748 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007749 Py_ssize_t collstart = i;
7750 Py_ssize_t collend = i+1;
7751 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752
Benjamin Peterson29060642009-01-31 22:14:21 +00007753 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007754 while (collend < size) {
7755 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 goto onError;
7757 Py_XDECREF(x);
7758 if (x!=Py_None)
7759 break;
7760 ++collend;
7761 }
7762 /* cache callback name lookup
7763 * (if not done yet, i.e. it's the first error) */
7764 if (known_errorHandler==-1) {
7765 if ((errors==NULL) || (!strcmp(errors, "strict")))
7766 known_errorHandler = 1;
7767 else if (!strcmp(errors, "replace"))
7768 known_errorHandler = 2;
7769 else if (!strcmp(errors, "ignore"))
7770 known_errorHandler = 3;
7771 else if (!strcmp(errors, "xmlcharrefreplace"))
7772 known_errorHandler = 4;
7773 else
7774 known_errorHandler = 0;
7775 }
7776 switch (known_errorHandler) {
7777 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007778 raise_translate_exception(&exc, input, collstart,
7779 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007780 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007781 case 2: /* replace */
7782 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007783 for (coll = collstart; coll<collend; coll++)
7784 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 /* fall through */
7786 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007787 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007788 break;
7789 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007790 /* generate replacement (temporarily (mis)uses i) */
7791 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 char buffer[2+29+1+1];
7793 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007794 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7795 if (charmaptranslate_makespace(&output, &osize,
7796 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 goto onError;
7798 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007799 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007801 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007802 break;
7803 default:
7804 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007805 reason, input, &exc,
7806 collstart, collend, &newpos);
7807 if (repunicode == NULL || PyUnicode_READY(repunicode) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007808 goto onError;
7809 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007810 repsize = PyUnicode_GET_LENGTH(repunicode);
7811 if (charmaptranslate_makespace(&output, &osize,
7812 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007813 Py_DECREF(repunicode);
7814 goto onError;
7815 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007816 for (uni2 = 0; repsize-->0; ++uni2)
7817 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7818 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007820 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007821 }
7822 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007823 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7824 if (!res)
7825 goto onError;
7826 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007827 Py_XDECREF(exc);
7828 Py_XDECREF(errorHandler);
7829 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007832 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007833 Py_XDECREF(exc);
7834 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835 return NULL;
7836}
7837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007838/* Deprecated. Use PyUnicode_Translate instead. */
7839PyObject *
7840PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7841 Py_ssize_t size,
7842 PyObject *mapping,
7843 const char *errors)
7844{
7845 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7846 if (!unicode)
7847 return NULL;
7848 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7849}
7850
Alexander Belopolsky40018472011-02-26 01:02:56 +00007851PyObject *
7852PyUnicode_Translate(PyObject *str,
7853 PyObject *mapping,
7854 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855{
7856 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007857
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858 str = PyUnicode_FromObject(str);
7859 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007860 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007861 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862 Py_DECREF(str);
7863 return result;
Tim Petersced69f82003-09-16 20:30:58 +00007864
Benjamin Peterson29060642009-01-31 22:14:21 +00007865 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866 Py_XDECREF(str);
7867 return NULL;
7868}
Tim Petersced69f82003-09-16 20:30:58 +00007869
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007870static Py_UCS4
7871fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7872{
7873 /* No need to call PyUnicode_READY(self) because this function is only
7874 called as a callback from fixup() which does it already. */
7875 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7876 const int kind = PyUnicode_KIND(self);
7877 void *data = PyUnicode_DATA(self);
7878 Py_UCS4 maxchar = 0, ch, fixed;
7879 Py_ssize_t i;
7880
7881 for (i = 0; i < len; ++i) {
7882 ch = PyUnicode_READ(kind, data, i);
7883 fixed = 0;
7884 if (ch > 127) {
7885 if (Py_UNICODE_ISSPACE(ch))
7886 fixed = ' ';
7887 else {
7888 const int decimal = Py_UNICODE_TODECIMAL(ch);
7889 if (decimal >= 0)
7890 fixed = '0' + decimal;
7891 }
7892 if (fixed != 0) {
7893 if (fixed > maxchar)
7894 maxchar = fixed;
7895 PyUnicode_WRITE(kind, data, i, fixed);
7896 }
7897 else if (ch > maxchar)
7898 maxchar = ch;
7899 }
7900 else if (ch > maxchar)
7901 maxchar = ch;
7902 }
7903
7904 return maxchar;
7905}
7906
7907PyObject *
7908_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7909{
7910 if (!PyUnicode_Check(unicode)) {
7911 PyErr_BadInternalCall();
7912 return NULL;
7913 }
7914 if (PyUnicode_READY(unicode) == -1)
7915 return NULL;
7916 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
7917 /* If the string is already ASCII, just return the same string */
7918 Py_INCREF(unicode);
7919 return unicode;
7920 }
7921 return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
7922}
7923
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007924PyObject *
7925PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
7926 Py_ssize_t length)
7927{
7928 PyObject *result;
7929 Py_UNICODE *p; /* write pointer into result */
7930 Py_ssize_t i;
7931 /* Copy to a new string */
7932 result = (PyObject *)_PyUnicode_New(length);
7933 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
7934 if (result == NULL)
7935 return result;
7936 p = PyUnicode_AS_UNICODE(result);
7937 /* Iterate over code points */
7938 for (i = 0; i < length; i++) {
7939 Py_UNICODE ch =s[i];
7940 if (ch > 127) {
7941 int decimal = Py_UNICODE_TODECIMAL(ch);
7942 if (decimal >= 0)
7943 p[i] = '0' + decimal;
7944 }
7945 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007946 if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
7947 Py_DECREF(result);
7948 return NULL;
7949 }
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00007950 return result;
7951}
Guido van Rossum9e896b32000-04-05 20:11:21 +00007952/* --- Decimal Encoder ---------------------------------------------------- */
7953
Alexander Belopolsky40018472011-02-26 01:02:56 +00007954int
7955PyUnicode_EncodeDecimal(Py_UNICODE *s,
7956 Py_ssize_t length,
7957 char *output,
7958 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00007959{
7960 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007961 PyObject *errorHandler = NULL;
7962 PyObject *exc = NULL;
7963 const char *encoding = "decimal";
7964 const char *reason = "invalid decimal Unicode string";
7965 /* the following variable is used for caching string comparisons
7966 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
7967 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007968
7969 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007970 PyErr_BadArgument();
7971 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00007972 }
7973
7974 p = s;
7975 end = s + length;
7976 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007977 register Py_UNICODE ch = *p;
7978 int decimal;
7979 PyObject *repunicode;
7980 Py_ssize_t repsize;
7981 Py_ssize_t newpos;
7982 Py_UNICODE *uni2;
7983 Py_UNICODE *collstart;
7984 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00007985
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007987 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00007988 ++p;
7989 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007990 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 decimal = Py_UNICODE_TODECIMAL(ch);
7992 if (decimal >= 0) {
7993 *output++ = '0' + decimal;
7994 ++p;
7995 continue;
7996 }
7997 if (0 < ch && ch < 256) {
7998 *output++ = (char)ch;
7999 ++p;
8000 continue;
8001 }
8002 /* All other characters are considered unencodable */
8003 collstart = p;
8004 collend = p+1;
8005 while (collend < end) {
8006 if ((0 < *collend && *collend < 256) ||
8007 !Py_UNICODE_ISSPACE(*collend) ||
8008 Py_UNICODE_TODECIMAL(*collend))
8009 break;
8010 }
8011 /* cache callback name lookup
8012 * (if not done yet, i.e. it's the first error) */
8013 if (known_errorHandler==-1) {
8014 if ((errors==NULL) || (!strcmp(errors, "strict")))
8015 known_errorHandler = 1;
8016 else if (!strcmp(errors, "replace"))
8017 known_errorHandler = 2;
8018 else if (!strcmp(errors, "ignore"))
8019 known_errorHandler = 3;
8020 else if (!strcmp(errors, "xmlcharrefreplace"))
8021 known_errorHandler = 4;
8022 else
8023 known_errorHandler = 0;
8024 }
8025 switch (known_errorHandler) {
8026 case 1: /* strict */
8027 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8028 goto onError;
8029 case 2: /* replace */
8030 for (p = collstart; p < collend; ++p)
8031 *output++ = '?';
8032 /* fall through */
8033 case 3: /* ignore */
8034 p = collend;
8035 break;
8036 case 4: /* xmlcharrefreplace */
8037 /* generate replacement (temporarily (mis)uses p) */
8038 for (p = collstart; p < collend; ++p)
8039 output += sprintf(output, "&#%d;", (int)*p);
8040 p = collend;
8041 break;
8042 default:
8043 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8044 encoding, reason, s, length, &exc,
8045 collstart-s, collend-s, &newpos);
8046 if (repunicode == NULL)
8047 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008048 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008049 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008050 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8051 Py_DECREF(repunicode);
8052 goto onError;
8053 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008054 /* generate replacement */
8055 repsize = PyUnicode_GET_SIZE(repunicode);
8056 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8057 Py_UNICODE ch = *uni2;
8058 if (Py_UNICODE_ISSPACE(ch))
8059 *output++ = ' ';
8060 else {
8061 decimal = Py_UNICODE_TODECIMAL(ch);
8062 if (decimal >= 0)
8063 *output++ = '0' + decimal;
8064 else if (0 < ch && ch < 256)
8065 *output++ = (char)ch;
8066 else {
8067 Py_DECREF(repunicode);
8068 raise_encode_exception(&exc, encoding,
8069 s, length, collstart-s, collend-s, reason);
8070 goto onError;
8071 }
8072 }
8073 }
8074 p = s + newpos;
8075 Py_DECREF(repunicode);
8076 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008077 }
8078 /* 0-terminate the output string */
8079 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008080 Py_XDECREF(exc);
8081 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008082 return 0;
8083
Benjamin Peterson29060642009-01-31 22:14:21 +00008084 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008085 Py_XDECREF(exc);
8086 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008087 return -1;
8088}
8089
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090/* --- Helpers ------------------------------------------------------------ */
8091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008092#include "stringlib/ucs1lib.h"
8093#include "stringlib/fastsearch.h"
8094#include "stringlib/partition.h"
8095#include "stringlib/split.h"
8096#include "stringlib/count.h"
8097#include "stringlib/find.h"
8098#include "stringlib/localeutil.h"
8099#include "stringlib/undef.h"
8100
8101#include "stringlib/ucs2lib.h"
8102#include "stringlib/fastsearch.h"
8103#include "stringlib/partition.h"
8104#include "stringlib/split.h"
8105#include "stringlib/count.h"
8106#include "stringlib/find.h"
8107#include "stringlib/localeutil.h"
8108#include "stringlib/undef.h"
8109
8110#include "stringlib/ucs4lib.h"
8111#include "stringlib/fastsearch.h"
8112#include "stringlib/partition.h"
8113#include "stringlib/split.h"
8114#include "stringlib/count.h"
8115#include "stringlib/find.h"
8116#include "stringlib/localeutil.h"
8117#include "stringlib/undef.h"
8118
8119static Py_ssize_t
8120any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8121 const Py_UCS1*, Py_ssize_t,
8122 Py_ssize_t, Py_ssize_t),
8123 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8124 const Py_UCS2*, Py_ssize_t,
8125 Py_ssize_t, Py_ssize_t),
8126 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8127 const Py_UCS4*, Py_ssize_t,
8128 Py_ssize_t, Py_ssize_t),
8129 PyObject* s1, PyObject* s2,
8130 Py_ssize_t start,
8131 Py_ssize_t end)
8132{
8133 int kind1, kind2, kind;
8134 void *buf1, *buf2;
8135 Py_ssize_t len1, len2, result;
8136
8137 kind1 = PyUnicode_KIND(s1);
8138 kind2 = PyUnicode_KIND(s2);
8139 kind = kind1 > kind2 ? kind1 : kind2;
8140 buf1 = PyUnicode_DATA(s1);
8141 buf2 = PyUnicode_DATA(s2);
8142 if (kind1 != kind)
8143 buf1 = _PyUnicode_AsKind(s1, kind);
8144 if (!buf1)
8145 return -2;
8146 if (kind2 != kind)
8147 buf2 = _PyUnicode_AsKind(s2, kind);
8148 if (!buf2) {
8149 if (kind1 != kind) PyMem_Free(buf1);
8150 return -2;
8151 }
8152 len1 = PyUnicode_GET_LENGTH(s1);
8153 len2 = PyUnicode_GET_LENGTH(s2);
8154
8155 switch(kind) {
8156 case PyUnicode_1BYTE_KIND:
8157 result = ucs1(buf1, len1, buf2, len2, start, end);
8158 break;
8159 case PyUnicode_2BYTE_KIND:
8160 result = ucs2(buf1, len1, buf2, len2, start, end);
8161 break;
8162 case PyUnicode_4BYTE_KIND:
8163 result = ucs4(buf1, len1, buf2, len2, start, end);
8164 break;
8165 default:
8166 assert(0); result = -2;
8167 }
8168
8169 if (kind1 != kind)
8170 PyMem_Free(buf1);
8171 if (kind2 != kind)
8172 PyMem_Free(buf2);
8173
8174 return result;
8175}
8176
8177Py_ssize_t
8178_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8179 Py_ssize_t n_buffer,
8180 void *digits, Py_ssize_t n_digits,
8181 Py_ssize_t min_width,
8182 const char *grouping,
8183 const char *thousands_sep)
8184{
8185 switch(kind) {
8186 case PyUnicode_1BYTE_KIND:
8187 return _PyUnicode_ucs1_InsertThousandsGrouping(
8188 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8189 min_width, grouping, thousands_sep);
8190 case PyUnicode_2BYTE_KIND:
8191 return _PyUnicode_ucs2_InsertThousandsGrouping(
8192 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8193 min_width, grouping, thousands_sep);
8194 case PyUnicode_4BYTE_KIND:
8195 return _PyUnicode_ucs4_InsertThousandsGrouping(
8196 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8197 min_width, grouping, thousands_sep);
8198 }
8199 assert(0);
8200 return -1;
8201}
8202
8203
Eric Smith8c663262007-08-25 02:26:07 +00008204#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008205#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008206
Thomas Wouters477c8d52006-05-27 19:21:47 +00008207#include "stringlib/count.h"
8208#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008209
/* Helper macro to fix up start/end slice values in place.

   `start` and `end` must be modifiable lvalues; `len` is the length of the
   sequence being sliced.  An `end` greater than `len` is clamped to `len`;
   a negative `end` or `start` has `len` added to it and is then clamped to
   0 if still negative.  A `start` greater than `len` is left untouched.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (safe under an unbraced if/else) and must be followed
   by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008224
Alexander Belopolsky40018472011-02-26 01:02:56 +00008225Py_ssize_t
8226PyUnicode_Count(PyObject *str,
8227 PyObject *substr,
8228 Py_ssize_t start,
8229 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008231 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008232 PyUnicodeObject* str_obj;
8233 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008234 int kind1, kind2, kind;
8235 void *buf1 = NULL, *buf2 = NULL;
8236 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008237
Thomas Wouters477c8d52006-05-27 19:21:47 +00008238 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008239 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008241 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008242 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 Py_DECREF(str_obj);
8244 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245 }
Tim Petersced69f82003-09-16 20:30:58 +00008246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008247 kind1 = PyUnicode_KIND(str_obj);
8248 kind2 = PyUnicode_KIND(sub_obj);
8249 kind = kind1 > kind2 ? kind1 : kind2;
8250 buf1 = PyUnicode_DATA(str_obj);
8251 if (kind1 != kind)
8252 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8253 if (!buf1)
8254 goto onError;
8255 buf2 = PyUnicode_DATA(sub_obj);
8256 if (kind2 != kind)
8257 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8258 if (!buf2)
8259 goto onError;
8260 len1 = PyUnicode_GET_LENGTH(str_obj);
8261 len2 = PyUnicode_GET_LENGTH(sub_obj);
8262
8263 ADJUST_INDICES(start, end, len1);
8264 switch(kind) {
8265 case PyUnicode_1BYTE_KIND:
8266 result = ucs1lib_count(
8267 ((Py_UCS1*)buf1) + start, end - start,
8268 buf2, len2, PY_SSIZE_T_MAX
8269 );
8270 break;
8271 case PyUnicode_2BYTE_KIND:
8272 result = ucs2lib_count(
8273 ((Py_UCS2*)buf1) + start, end - start,
8274 buf2, len2, PY_SSIZE_T_MAX
8275 );
8276 break;
8277 case PyUnicode_4BYTE_KIND:
8278 result = ucs4lib_count(
8279 ((Py_UCS4*)buf1) + start, end - start,
8280 buf2, len2, PY_SSIZE_T_MAX
8281 );
8282 break;
8283 default:
8284 assert(0); result = 0;
8285 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008286
8287 Py_DECREF(sub_obj);
8288 Py_DECREF(str_obj);
8289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008290 if (kind1 != kind)
8291 PyMem_Free(buf1);
8292 if (kind2 != kind)
8293 PyMem_Free(buf2);
8294
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008296 onError:
8297 Py_DECREF(sub_obj);
8298 Py_DECREF(str_obj);
8299 if (kind1 != kind && buf1)
8300 PyMem_Free(buf1);
8301 if (kind2 != kind && buf2)
8302 PyMem_Free(buf2);
8303 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304}
8305
Alexander Belopolsky40018472011-02-26 01:02:56 +00008306Py_ssize_t
8307PyUnicode_Find(PyObject *str,
8308 PyObject *sub,
8309 Py_ssize_t start,
8310 Py_ssize_t end,
8311 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008313 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008314
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008316 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008317 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008318 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008319 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 Py_DECREF(str);
8321 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322 }
Tim Petersced69f82003-09-16 20:30:58 +00008323
Thomas Wouters477c8d52006-05-27 19:21:47 +00008324 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008325 result = any_find_slice(
8326 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8327 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008328 );
8329 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008330 result = any_find_slice(
8331 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8332 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008333 );
8334
Guido van Rossumd57fd912000-03-10 22:53:23 +00008335 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008336 Py_DECREF(sub);
8337
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338 return result;
8339}
8340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008341Py_ssize_t
8342PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8343 Py_ssize_t start, Py_ssize_t end,
8344 int direction)
8345{
8346 char *result;
8347 int kind;
8348 if (PyUnicode_READY(str) == -1)
8349 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008350 if (start < 0 || end < 0) {
8351 PyErr_SetString(PyExc_IndexError, "string index out of range");
8352 return -2;
8353 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008354 if (end > PyUnicode_GET_LENGTH(str))
8355 end = PyUnicode_GET_LENGTH(str);
8356 kind = PyUnicode_KIND(str);
8357 result = findchar(PyUnicode_1BYTE_DATA(str)
8358 + PyUnicode_KIND_SIZE(kind, start),
8359 kind,
8360 end-start, ch, direction);
8361 if (!result)
8362 return -1;
8363 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8364}
8365
Alexander Belopolsky40018472011-02-26 01:02:56 +00008366static int
8367tailmatch(PyUnicodeObject *self,
8368 PyUnicodeObject *substring,
8369 Py_ssize_t start,
8370 Py_ssize_t end,
8371 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008372{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008373 int kind_self;
8374 int kind_sub;
8375 void *data_self;
8376 void *data_sub;
8377 Py_ssize_t offset;
8378 Py_ssize_t i;
8379 Py_ssize_t end_sub;
8380
8381 if (PyUnicode_READY(self) == -1 ||
8382 PyUnicode_READY(substring) == -1)
8383 return 0;
8384
8385 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386 return 1;
8387
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008388 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8389 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008390 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008393 kind_self = PyUnicode_KIND(self);
8394 data_self = PyUnicode_DATA(self);
8395 kind_sub = PyUnicode_KIND(substring);
8396 data_sub = PyUnicode_DATA(substring);
8397 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8398
8399 if (direction > 0)
8400 offset = end;
8401 else
8402 offset = start;
8403
8404 if (PyUnicode_READ(kind_self, data_self, offset) ==
8405 PyUnicode_READ(kind_sub, data_sub, 0) &&
8406 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8407 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8408 /* If both are of the same kind, memcmp is sufficient */
8409 if (kind_self == kind_sub) {
8410 return ! memcmp((char *)data_self +
8411 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8412 data_sub,
8413 PyUnicode_GET_LENGTH(substring) *
8414 PyUnicode_CHARACTER_SIZE(substring));
8415 }
8416 /* otherwise we have to compare each character by first accesing it */
8417 else {
8418 /* We do not need to compare 0 and len(substring)-1 because
8419 the if statement above ensured already that they are equal
8420 when we end up here. */
8421 // TODO: honor direction and do a forward or backwards search
8422 for (i = 1; i < end_sub; ++i) {
8423 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8424 PyUnicode_READ(kind_sub, data_sub, i))
8425 return 0;
8426 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008427 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008428 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429 }
8430
8431 return 0;
8432}
8433
Alexander Belopolsky40018472011-02-26 01:02:56 +00008434Py_ssize_t
8435PyUnicode_Tailmatch(PyObject *str,
8436 PyObject *substr,
8437 Py_ssize_t start,
8438 Py_ssize_t end,
8439 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008440{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008441 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008442
Guido van Rossumd57fd912000-03-10 22:53:23 +00008443 str = PyUnicode_FromObject(str);
8444 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446 substr = PyUnicode_FromObject(substr);
8447 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 Py_DECREF(str);
8449 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450 }
Tim Petersced69f82003-09-16 20:30:58 +00008451
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 (PyUnicodeObject *)substr,
8454 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455 Py_DECREF(str);
8456 Py_DECREF(substr);
8457 return result;
8458}
8459
Guido van Rossumd57fd912000-03-10 22:53:23 +00008460/* Apply fixfct filter to the Unicode object self and return a
8461 reference to the modified object */
8462
Alexander Belopolsky40018472011-02-26 01:02:56 +00008463static PyObject *
8464fixup(PyUnicodeObject *self,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008465 Py_UCS4 (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008466{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008467 PyObject *u;
8468 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008470 if (PyUnicode_READY(self) == -1)
8471 return NULL;
8472 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8473 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8474 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008475 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008477
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008478 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8479 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008480
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008481 /* fix functions return the new maximum character in a string,
8482 if the kind of the resulting unicode object does not change,
8483 everything is fine. Otherwise we need to change the string kind
8484 and re-run the fix function. */
8485 maxchar_new = fixfct((PyUnicodeObject*)u);
8486 if (maxchar_new == 0)
8487 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8488 else if (maxchar_new <= 127)
8489 maxchar_new = 127;
8490 else if (maxchar_new <= 255)
8491 maxchar_new = 255;
8492 else if (maxchar_new <= 65535)
8493 maxchar_new = 65535;
8494 else
8495 maxchar_new = 1114111; /* 0x10ffff */
8496
8497 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 /* fixfct should return TRUE if it modified the buffer. If
8499 FALSE, return a reference to the original buffer instead
8500 (to save space, not time) */
8501 Py_INCREF(self);
8502 Py_DECREF(u);
8503 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008505 else if (maxchar_new == maxchar_old) {
8506 return u;
8507 }
8508 else {
8509 /* In case the maximum character changed, we need to
8510 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008511 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008512 if (v == NULL) {
8513 Py_DECREF(u);
8514 return NULL;
8515 }
8516 if (maxchar_new > maxchar_old) {
8517 /* If the maxchar increased so that the kind changed, not all
8518 characters are representable anymore and we need to fix the
8519 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008520 if (PyUnicode_CopyCharacters(v, 0,
8521 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008522 PyUnicode_GET_LENGTH(self)) < 0)
8523 {
8524 Py_DECREF(u);
8525 return NULL;
8526 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008527 maxchar_old = fixfct((PyUnicodeObject*)v);
8528 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8529 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008530 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008531 if (PyUnicode_CopyCharacters(v, 0,
8532 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008533 PyUnicode_GET_LENGTH(self)) < 0)
8534 {
8535 Py_DECREF(u);
8536 return NULL;
8537 }
8538 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008539
8540 Py_DECREF(u);
8541 return v;
8542 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543}
8544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008545static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008546fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008548 /* No need to call PyUnicode_READY(self) because this function is only
8549 called as a callback from fixup() which does it already. */
8550 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8551 const int kind = PyUnicode_KIND(self);
8552 void *data = PyUnicode_DATA(self);
8553 int touched = 0;
8554 Py_UCS4 maxchar = 0;
8555 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008557 for (i = 0; i < len; ++i) {
8558 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8559 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8560 if (up != ch) {
8561 if (up > maxchar)
8562 maxchar = up;
8563 PyUnicode_WRITE(kind, data, i, up);
8564 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566 else if (ch > maxchar)
8567 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568 }
8569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 if (touched)
8571 return maxchar;
8572 else
8573 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574}
8575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008576static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008577fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8580 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8581 const int kind = PyUnicode_KIND(self);
8582 void *data = PyUnicode_DATA(self);
8583 int touched = 0;
8584 Py_UCS4 maxchar = 0;
8585 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008587 for(i = 0; i < len; ++i) {
8588 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8589 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8590 if (lo != ch) {
8591 if (lo > maxchar)
8592 maxchar = lo;
8593 PyUnicode_WRITE(kind, data, i, lo);
8594 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008596 else if (ch > maxchar)
8597 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598 }
8599
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008600 if (touched)
8601 return maxchar;
8602 else
8603 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604}
8605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008606static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008607fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008609 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8610 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8611 const int kind = PyUnicode_KIND(self);
8612 void *data = PyUnicode_DATA(self);
8613 int touched = 0;
8614 Py_UCS4 maxchar = 0;
8615 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008617 for(i = 0; i < len; ++i) {
8618 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8619 Py_UCS4 nu = 0;
8620
8621 if (Py_UNICODE_ISUPPER(ch))
8622 nu = Py_UNICODE_TOLOWER(ch);
8623 else if (Py_UNICODE_ISLOWER(ch))
8624 nu = Py_UNICODE_TOUPPER(ch);
8625
8626 if (nu != 0) {
8627 if (nu > maxchar)
8628 maxchar = nu;
8629 PyUnicode_WRITE(kind, data, i, nu);
8630 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 else if (ch > maxchar)
8633 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634 }
8635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636 if (touched)
8637 return maxchar;
8638 else
8639 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640}
8641
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008642static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008643fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008645 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8646 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8647 const int kind = PyUnicode_KIND(self);
8648 void *data = PyUnicode_DATA(self);
8649 int touched = 0;
8650 Py_UCS4 maxchar = 0;
8651 Py_ssize_t i = 0;
8652 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008653
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008654 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008656
8657 ch = PyUnicode_READ(kind, data, i);
8658 if (!Py_UNICODE_ISUPPER(ch)) {
8659 maxchar = Py_UNICODE_TOUPPER(ch);
8660 PyUnicode_WRITE(kind, data, i, maxchar);
8661 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 ++i;
8664 for(; i < len; ++i) {
8665 ch = PyUnicode_READ(kind, data, i);
8666 if (!Py_UNICODE_ISLOWER(ch)) {
8667 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8668 if (lo > maxchar)
8669 maxchar = lo;
8670 PyUnicode_WRITE(kind, data, i, lo);
8671 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008672 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008673 else if (ch > maxchar)
8674 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676
8677 if (touched)
8678 return maxchar;
8679 else
8680 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681}
8682
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008683static Py_UCS4
Alexander Belopolsky40018472011-02-26 01:02:56 +00008684fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008686 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8687 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8688 const int kind = PyUnicode_KIND(self);
8689 void *data = PyUnicode_DATA(self);
8690 Py_UCS4 maxchar = 0;
8691 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692 int previous_is_cased;
8693
8694 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008695 if (len == 1) {
8696 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8697 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8698 if (ti != ch) {
8699 PyUnicode_WRITE(kind, data, i, ti);
8700 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 }
8702 else
8703 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008705 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008706 for(; i < len; ++i) {
8707 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8708 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008709
Benjamin Peterson29060642009-01-31 22:14:21 +00008710 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008711 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008713 nu = Py_UNICODE_TOTITLE(ch);
8714
8715 if (nu > maxchar)
8716 maxchar = nu;
8717 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008718
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 if (Py_UNICODE_ISLOWER(ch) ||
8720 Py_UNICODE_ISUPPER(ch) ||
8721 Py_UNICODE_ISTITLE(ch))
8722 previous_is_cased = 1;
8723 else
8724 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008726 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727}
8728
Tim Peters8ce9f162004-08-27 01:49:32 +00008729PyObject *
8730PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008732 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008733 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00008735 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008736 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
8737 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00008738 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008739 Py_ssize_t sz, i, res_offset;
8740 Py_UCS4 maxchar = 0;
8741 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008742
Tim Peters05eba1f2004-08-27 21:32:02 +00008743 fseq = PySequence_Fast(seq, "");
8744 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008745 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00008746 }
8747
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008748 /* NOTE: the following code can't call back into Python code,
8749 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00008750 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008751
Tim Peters05eba1f2004-08-27 21:32:02 +00008752 seqlen = PySequence_Fast_GET_SIZE(fseq);
8753 /* If empty sequence, return u"". */
8754 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008756 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00008757 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008758 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00008759 /* If singleton sequence with an exact Unicode, return that. */
8760 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 item = items[0];
8762 if (PyUnicode_CheckExact(item)) {
8763 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008764 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 goto Done;
8766 }
Tim Peters8ce9f162004-08-27 01:49:32 +00008767 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008768 else {
8769 /* Set up sep and seplen */
8770 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008771 /* fall back to a blank space separator */
8772 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02008773 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008774 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00008775 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008776 else {
8777 if (!PyUnicode_Check(separator)) {
8778 PyErr_Format(PyExc_TypeError,
8779 "separator: expected str instance,"
8780 " %.80s found",
8781 Py_TYPE(separator)->tp_name);
8782 goto onError;
8783 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008784 if (PyUnicode_READY(separator) == -1)
8785 goto onError;
8786 sep = separator;
8787 seplen = PyUnicode_GET_LENGTH(separator);
8788 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8789 /* inc refcount to keep this code path symetric with the
8790 above case of a blank separator */
8791 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00008792 }
8793 }
8794
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008795 /* There are at least two things to join, or else we have a subclass
8796 * of str in the sequence.
8797 * Do a pre-pass to figure out the total amount of space we'll
8798 * need (sz), and see whether all argument are strings.
8799 */
8800 sz = 0;
8801 for (i = 0; i < seqlen; i++) {
8802 const Py_ssize_t old_sz = sz;
8803 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00008804 if (!PyUnicode_Check(item)) {
8805 PyErr_Format(PyExc_TypeError,
8806 "sequence item %zd: expected str instance,"
8807 " %.80s found",
8808 i, Py_TYPE(item)->tp_name);
8809 goto onError;
8810 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008811 if (PyUnicode_READY(item) == -1)
8812 goto onError;
8813 sz += PyUnicode_GET_LENGTH(item);
8814 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8815 if (item_maxchar > maxchar)
8816 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008817 if (i != 0)
8818 sz += seplen;
8819 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8820 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008821 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008822 goto onError;
8823 }
8824 }
Tim Petersced69f82003-09-16 20:30:58 +00008825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008826 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008827 if (res == NULL)
8828 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00008829
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008830 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +00008832 Py_ssize_t itemlen;
8833 item = items[i];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008834 itemlen = PyUnicode_GET_LENGTH(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008835 /* Copy item, and maybe the separator. */
8836 if (i) {
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008837 if (PyUnicode_CopyCharacters(res, res_offset,
8838 sep, 0, seplen) < 0)
8839 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008840 res_offset += seplen;
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008842 if (PyUnicode_CopyCharacters(res, res_offset,
8843 item, 0, itemlen) < 0)
8844 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845 res_offset += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00008846 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008847 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00008848
Benjamin Peterson29060642009-01-31 22:14:21 +00008849 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00008850 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851 Py_XDECREF(sep);
8852 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853
Benjamin Peterson29060642009-01-31 22:14:21 +00008854 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00008855 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008856 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00008857 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858 return NULL;
8859}
8860
/* Store `value` into `length` consecutive characters of the buffer `data`,
   beginning at character index `start`.  `kind` selects the element type:
   1-byte buffers are filled with memset(), 2-byte buffers as Py_UCS2, and
   any other kind as Py_UCS4.  `kind` must not be PyUnicode_WCHAR_KIND.

   Every argument is parenthesized in the expansion so that arbitrary
   expressions may be passed; note that `value` and `length` may still be
   evaluated more than once. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
8883
Alexander Belopolsky40018472011-02-26 01:02:56 +00008884static PyUnicodeObject *
8885pad(PyUnicodeObject *self,
8886 Py_ssize_t left,
8887 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008889{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008890 PyObject *u;
8891 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008892 int kind;
8893 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008894
8895 if (left < 0)
8896 left = 0;
8897 if (right < 0)
8898 right = 0;
8899
Tim Peters7a29bd52001-09-12 03:03:31 +00008900 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901 Py_INCREF(self);
8902 return self;
8903 }
8904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008905 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
8906 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00008907 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
8908 return NULL;
8909 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008910 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
8911 if (fill > maxchar)
8912 maxchar = fill;
8913 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008914 if (!u)
8915 return NULL;
8916
8917 kind = PyUnicode_KIND(u);
8918 data = PyUnicode_DATA(u);
8919 if (left)
8920 FILL(kind, data, fill, 0, left);
8921 if (right)
8922 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02008923 if (PyUnicode_CopyCharacters(u, left,
8924 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008925 _PyUnicode_LENGTH(self)) < 0)
8926 {
8927 Py_DECREF(u);
8928 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008929 }
8930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008931 return (PyUnicodeObject*)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008933#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934
Alexander Belopolsky40018472011-02-26 01:02:56 +00008935PyObject *
8936PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939
8940 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008941 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008942 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944 switch(PyUnicode_KIND(string)) {
8945 case PyUnicode_1BYTE_KIND:
8946 list = ucs1lib_splitlines(
8947 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
8948 PyUnicode_GET_LENGTH(string), keepends);
8949 break;
8950 case PyUnicode_2BYTE_KIND:
8951 list = ucs2lib_splitlines(
8952 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
8953 PyUnicode_GET_LENGTH(string), keepends);
8954 break;
8955 case PyUnicode_4BYTE_KIND:
8956 list = ucs4lib_splitlines(
8957 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
8958 PyUnicode_GET_LENGTH(string), keepends);
8959 break;
8960 default:
8961 assert(0);
8962 list = 0;
8963 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008964 Py_DECREF(string);
8965 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966}
8967
Alexander Belopolsky40018472011-02-26 01:02:56 +00008968static PyObject *
8969split(PyUnicodeObject *self,
8970 PyUnicodeObject *substring,
8971 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 int kind1, kind2, kind;
8974 void *buf1, *buf2;
8975 Py_ssize_t len1, len2;
8976 PyObject* out;
8977
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008979 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008981 if (PyUnicode_READY(self) == -1)
8982 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008984 if (substring == NULL)
8985 switch(PyUnicode_KIND(self)) {
8986 case PyUnicode_1BYTE_KIND:
8987 return ucs1lib_split_whitespace(
8988 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
8989 PyUnicode_GET_LENGTH(self), maxcount
8990 );
8991 case PyUnicode_2BYTE_KIND:
8992 return ucs2lib_split_whitespace(
8993 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
8994 PyUnicode_GET_LENGTH(self), maxcount
8995 );
8996 case PyUnicode_4BYTE_KIND:
8997 return ucs4lib_split_whitespace(
8998 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
8999 PyUnicode_GET_LENGTH(self), maxcount
9000 );
9001 default:
9002 assert(0);
9003 return NULL;
9004 }
9005
9006 if (PyUnicode_READY(substring) == -1)
9007 return NULL;
9008
9009 kind1 = PyUnicode_KIND(self);
9010 kind2 = PyUnicode_KIND(substring);
9011 kind = kind1 > kind2 ? kind1 : kind2;
9012 buf1 = PyUnicode_DATA(self);
9013 buf2 = PyUnicode_DATA(substring);
9014 if (kind1 != kind)
9015 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9016 if (!buf1)
9017 return NULL;
9018 if (kind2 != kind)
9019 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9020 if (!buf2) {
9021 if (kind1 != kind) PyMem_Free(buf1);
9022 return NULL;
9023 }
9024 len1 = PyUnicode_GET_LENGTH(self);
9025 len2 = PyUnicode_GET_LENGTH(substring);
9026
9027 switch(kind) {
9028 case PyUnicode_1BYTE_KIND:
9029 out = ucs1lib_split(
9030 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9031 break;
9032 case PyUnicode_2BYTE_KIND:
9033 out = ucs2lib_split(
9034 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9035 break;
9036 case PyUnicode_4BYTE_KIND:
9037 out = ucs4lib_split(
9038 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9039 break;
9040 default:
9041 out = NULL;
9042 }
9043 if (kind1 != kind)
9044 PyMem_Free(buf1);
9045 if (kind2 != kind)
9046 PyMem_Free(buf2);
9047 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009048}
9049
Alexander Belopolsky40018472011-02-26 01:02:56 +00009050static PyObject *
9051rsplit(PyUnicodeObject *self,
9052 PyUnicodeObject *substring,
9053 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009054{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009055 int kind1, kind2, kind;
9056 void *buf1, *buf2;
9057 Py_ssize_t len1, len2;
9058 PyObject* out;
9059
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009060 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009061 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009063 if (PyUnicode_READY(self) == -1)
9064 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066 if (substring == NULL)
9067 switch(PyUnicode_KIND(self)) {
9068 case PyUnicode_1BYTE_KIND:
9069 return ucs1lib_rsplit_whitespace(
9070 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9071 PyUnicode_GET_LENGTH(self), maxcount
9072 );
9073 case PyUnicode_2BYTE_KIND:
9074 return ucs2lib_rsplit_whitespace(
9075 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9076 PyUnicode_GET_LENGTH(self), maxcount
9077 );
9078 case PyUnicode_4BYTE_KIND:
9079 return ucs4lib_rsplit_whitespace(
9080 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9081 PyUnicode_GET_LENGTH(self), maxcount
9082 );
9083 default:
9084 assert(0);
9085 return NULL;
9086 }
9087
9088 if (PyUnicode_READY(substring) == -1)
9089 return NULL;
9090
9091 kind1 = PyUnicode_KIND(self);
9092 kind2 = PyUnicode_KIND(substring);
9093 kind = kind1 > kind2 ? kind1 : kind2;
9094 buf1 = PyUnicode_DATA(self);
9095 buf2 = PyUnicode_DATA(substring);
9096 if (kind1 != kind)
9097 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9098 if (!buf1)
9099 return NULL;
9100 if (kind2 != kind)
9101 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9102 if (!buf2) {
9103 if (kind1 != kind) PyMem_Free(buf1);
9104 return NULL;
9105 }
9106 len1 = PyUnicode_GET_LENGTH(self);
9107 len2 = PyUnicode_GET_LENGTH(substring);
9108
9109 switch(kind) {
9110 case PyUnicode_1BYTE_KIND:
9111 out = ucs1lib_rsplit(
9112 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9113 break;
9114 case PyUnicode_2BYTE_KIND:
9115 out = ucs2lib_rsplit(
9116 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9117 break;
9118 case PyUnicode_4BYTE_KIND:
9119 out = ucs4lib_rsplit(
9120 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9121 break;
9122 default:
9123 out = NULL;
9124 }
9125 if (kind1 != kind)
9126 PyMem_Free(buf1);
9127 if (kind2 != kind)
9128 PyMem_Free(buf2);
9129 return out;
9130}
9131
9132static Py_ssize_t
9133anylib_find(int kind, void *buf1, Py_ssize_t len1,
9134 void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9135{
9136 switch(kind) {
9137 case PyUnicode_1BYTE_KIND:
9138 return ucs1lib_find(buf1, len1, buf2, len2, offset);
9139 case PyUnicode_2BYTE_KIND:
9140 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9141 case PyUnicode_4BYTE_KIND:
9142 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9143 }
9144 assert(0);
9145 return -1;
9146}
9147
9148static Py_ssize_t
9149anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9150 void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9151{
9152 switch(kind) {
9153 case PyUnicode_1BYTE_KIND:
9154 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9155 case PyUnicode_2BYTE_KIND:
9156 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9157 case PyUnicode_4BYTE_KIND:
9158 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9159 }
9160 assert(0);
9161 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009162}
9163
Alexander Belopolsky40018472011-02-26 01:02:56 +00009164static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009165replace(PyObject *self, PyObject *str1,
9166 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009167{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009168 PyObject *u;
9169 char *sbuf = PyUnicode_DATA(self);
9170 char *buf1 = PyUnicode_DATA(str1);
9171 char *buf2 = PyUnicode_DATA(str2);
9172 int srelease = 0, release1 = 0, release2 = 0;
9173 int skind = PyUnicode_KIND(self);
9174 int kind1 = PyUnicode_KIND(str1);
9175 int kind2 = PyUnicode_KIND(str2);
9176 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9177 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9178 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009179
9180 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009181 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009182 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009183 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009184
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009185 if (skind < kind1)
9186 /* substring too wide to be present */
9187 goto nothing;
9188
9189 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009190 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009191 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009192 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009193 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009194 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009195 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009196 Py_UCS4 u1, u2, maxchar;
9197 int mayshrink, rkind;
9198 u1 = PyUnicode_READ_CHAR(str1, 0);
9199 if (!findchar(sbuf, PyUnicode_KIND(self),
9200 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009201 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009202 u2 = PyUnicode_READ_CHAR(str2, 0);
9203 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9204 /* Replacing u1 with u2 may cause a maxchar reduction in the
9205 result string. */
9206 mayshrink = maxchar > 127;
9207 if (u2 > maxchar) {
9208 maxchar = u2;
9209 mayshrink = 0;
9210 }
9211 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009212 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009213 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009214 if (PyUnicode_CopyCharacters(u, 0,
9215 (PyObject*)self, 0, slen) < 0)
9216 {
9217 Py_DECREF(u);
9218 return NULL;
9219 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009220 rkind = PyUnicode_KIND(u);
9221 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9222 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009223 if (--maxcount < 0)
9224 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009225 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009226 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009227 if (mayshrink) {
9228 PyObject *tmp = u;
9229 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9230 PyUnicode_GET_LENGTH(tmp));
9231 Py_DECREF(tmp);
9232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009233 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009234 int rkind = skind;
9235 char *res;
9236 if (kind1 < rkind) {
9237 /* widen substring */
9238 buf1 = _PyUnicode_AsKind(str1, rkind);
9239 if (!buf1) goto error;
9240 release1 = 1;
9241 }
9242 i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009243 if (i < 0)
9244 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245 if (rkind > kind2) {
9246 /* widen replacement */
9247 buf2 = _PyUnicode_AsKind(str2, rkind);
9248 if (!buf2) goto error;
9249 release2 = 1;
9250 }
9251 else if (rkind < kind2) {
9252 /* widen self and buf1 */
9253 rkind = kind2;
9254 if (release1) PyMem_Free(buf1);
9255 sbuf = _PyUnicode_AsKind(self, rkind);
9256 if (!sbuf) goto error;
9257 srelease = 1;
9258 buf1 = _PyUnicode_AsKind(str1, rkind);
9259 if (!buf1) goto error;
9260 release1 = 1;
9261 }
9262 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9263 if (!res) {
9264 PyErr_NoMemory();
9265 goto error;
9266 }
9267 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009268 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009269 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9270 buf2,
9271 PyUnicode_KIND_SIZE(rkind, len2));
9272 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009273
9274 while ( --maxcount > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9276 slen-i,
9277 buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009278 if (i == -1)
9279 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9281 buf2,
9282 PyUnicode_KIND_SIZE(rkind, len2));
9283 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009284 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285
9286 u = PyUnicode_FromKindAndData(rkind, res, slen);
9287 PyMem_Free(res);
9288 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009292 Py_ssize_t n, i, j, ires;
9293 Py_ssize_t product, new_size;
9294 int rkind = skind;
9295 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009297 if (kind1 < rkind) {
9298 buf1 = _PyUnicode_AsKind(str1, rkind);
9299 if (!buf1) goto error;
9300 release1 = 1;
9301 }
9302 n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009303 if (n == 0)
9304 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305 if (kind2 < rkind) {
9306 buf2 = _PyUnicode_AsKind(str2, rkind);
9307 if (!buf2) goto error;
9308 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009310 else if (kind2 > rkind) {
9311 rkind = kind2;
9312 sbuf = _PyUnicode_AsKind(self, rkind);
9313 if (!sbuf) goto error;
9314 srelease = 1;
9315 if (release1) PyMem_Free(buf1);
9316 buf1 = _PyUnicode_AsKind(str1, rkind);
9317 if (!buf1) goto error;
9318 release1 = 1;
9319 }
9320 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9321 PyUnicode_GET_LENGTH(str1))); */
9322 product = n * (len2-len1);
9323 if ((product / (len2-len1)) != n) {
9324 PyErr_SetString(PyExc_OverflowError,
9325 "replace string is too long");
9326 goto error;
9327 }
9328 new_size = slen + product;
9329 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9330 PyErr_SetString(PyExc_OverflowError,
9331 "replace string is too long");
9332 goto error;
9333 }
9334 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9335 if (!res)
9336 goto error;
9337 ires = i = 0;
9338 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009339 while (n-- > 0) {
9340 /* look for next match */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009341 j = anylib_find(rkind,
9342 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9343 slen-i, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009344 if (j == -1)
9345 break;
9346 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009347 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009348 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9349 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9350 PyUnicode_KIND_SIZE(rkind, j-i));
9351 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009352 }
9353 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009354 if (len2 > 0) {
9355 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9356 buf2,
9357 PyUnicode_KIND_SIZE(rkind, len2));
9358 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009359 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009361 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009362 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009363 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009364 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9365 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9366 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009367 } else {
9368 /* interleave */
9369 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009370 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9371 buf2,
9372 PyUnicode_KIND_SIZE(rkind, len2));
9373 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009374 if (--n <= 0)
9375 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009376 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9377 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9378 PyUnicode_KIND_SIZE(rkind, 1));
9379 ires++;
9380 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009381 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009382 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9383 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9384 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009385 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009386 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009387 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389 if (srelease)
9390 PyMem_FREE(sbuf);
9391 if (release1)
9392 PyMem_FREE(buf1);
9393 if (release2)
9394 PyMem_FREE(buf2);
9395 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009396
Benjamin Peterson29060642009-01-31 22:14:21 +00009397 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009398 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009399 if (srelease)
9400 PyMem_FREE(sbuf);
9401 if (release1)
9402 PyMem_FREE(buf1);
9403 if (release2)
9404 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009405 if (PyUnicode_CheckExact(self)) {
9406 Py_INCREF(self);
9407 return (PyObject *) self;
9408 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009409 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410 error:
9411 if (srelease && sbuf)
9412 PyMem_FREE(sbuf);
9413 if (release1 && buf1)
9414 PyMem_FREE(buf1);
9415 if (release2 && buf2)
9416 PyMem_FREE(buf2);
9417 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009418}
9419
9420/* --- Unicode Object Methods --------------------------------------------- */
9421
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009422PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009423 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009424\n\
9425Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009426characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009427
9428static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009429unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009430{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009431 return fixup(self, fixtitle);
9432}
9433
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009434PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009435 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009436\n\
9437Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009438have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009439
9440static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009441unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009442{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443 return fixup(self, fixcapitalize);
9444}
9445
#if 0
PyDoc_STRVAR(capwords__doc__,
    "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits self on whitespace, replaces every list entry with its
   fixcapitalize'd version, and joins the entries back together with
   PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  onError:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
9483
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009484/* Argument converter. Coerces to a single unicode character */
9485
9486static int
9487convert_uc(PyObject *obj, void *addr)
9488{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009490 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009491
Benjamin Peterson14339b62009-01-31 16:36:08 +00009492 uniobj = PyUnicode_FromObject(obj);
9493 if (uniobj == NULL) {
9494 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009495 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009496 return 0;
9497 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009498 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009499 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009500 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009501 Py_DECREF(uniobj);
9502 return 0;
9503 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009505 Py_DECREF(uniobj);
9506 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009507}
9508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009509PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009510 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009512Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009513done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514
9515static PyObject *
9516unicode_center(PyUnicodeObject *self, PyObject *args)
9517{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009518 Py_ssize_t marg, left;
9519 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009520 Py_UCS4 fillchar = ' ';
9521
Victor Stinnere9a29352011-10-01 02:14:59 +02009522 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009524
Victor Stinnere9a29352011-10-01 02:14:59 +02009525 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009526 return NULL;
9527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009528 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009529 Py_INCREF(self);
9530 return (PyObject*) self;
9531 }
9532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009533 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009534 left = marg / 2 + (marg & width & 1);
9535
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009536 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009537}
9538
Marc-André Lemburge5034372000-08-08 08:04:29 +00009539#if 0
9540
9541/* This code should go into some future Unicode collation support
9542 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009543 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009544
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009545/* speedy UTF-16 code point order comparison */
9546/* gleaned from: */
9547/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9548
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009549static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009550{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009551 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009552 0, 0, 0, 0, 0, 0, 0, 0,
9553 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009554 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009555};
9556
Guido van Rossumd57fd912000-03-10 22:53:23 +00009557static int
9558unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9559{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009560 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009561
Guido van Rossumd57fd912000-03-10 22:53:23 +00009562 Py_UNICODE *s1 = str1->str;
9563 Py_UNICODE *s2 = str2->str;
9564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 len1 = str1->_base._base.length;
9566 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009567
Guido van Rossumd57fd912000-03-10 22:53:23 +00009568 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009569 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009570
9571 c1 = *s1++;
9572 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009573
Benjamin Peterson29060642009-01-31 22:14:21 +00009574 if (c1 > (1<<11) * 26)
9575 c1 += utf16Fixup[c1>>11];
9576 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009577 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009578 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009579
9580 if (c1 != c2)
9581 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009582
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009583 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584 }
9585
9586 return (len1 < len2) ? -1 : (len1 != len2);
9587}
9588
Marc-André Lemburge5034372000-08-08 08:04:29 +00009589#else
9590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009591/* This function assumes that str1 and str2 are readied by the caller. */
9592
Marc-André Lemburge5034372000-08-08 08:04:29 +00009593static int
9594unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9595{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009596 int kind1, kind2;
9597 void *data1, *data2;
9598 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009599
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009600 kind1 = PyUnicode_KIND(str1);
9601 kind2 = PyUnicode_KIND(str2);
9602 data1 = PyUnicode_DATA(str1);
9603 data2 = PyUnicode_DATA(str2);
9604 len1 = PyUnicode_GET_LENGTH(str1);
9605 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607 for (i = 0; i < len1 && i < len2; ++i) {
9608 Py_UCS4 c1, c2;
9609 c1 = PyUnicode_READ(kind1, data1, i);
9610 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009611
9612 if (c1 != c2)
9613 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009614 }
9615
9616 return (len1 < len2) ? -1 : (len1 != len2);
9617}
9618
9619#endif
9620
Alexander Belopolsky40018472011-02-26 01:02:56 +00009621int
9622PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9625 if (PyUnicode_READY(left) == -1 ||
9626 PyUnicode_READY(right) == -1)
9627 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009628 return unicode_compare((PyUnicodeObject *)left,
9629 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009631 PyErr_Format(PyExc_TypeError,
9632 "Can't compare %.100s and %.100s",
9633 left->ob_type->tp_name,
9634 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635 return -1;
9636}
9637
Martin v. Löwis5b222132007-06-10 09:51:05 +00009638int
9639PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9640{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009641 Py_ssize_t i;
9642 int kind;
9643 void *data;
9644 Py_UCS4 chr;
9645
Victor Stinner910337b2011-10-03 03:20:16 +02009646 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009647 if (PyUnicode_READY(uni) == -1)
9648 return -1;
9649 kind = PyUnicode_KIND(uni);
9650 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009651 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9653 if (chr != str[i])
9654 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009655 /* This check keeps Python strings that end in '\0' from comparing equal
9656 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009657 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009658 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009659 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009660 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009661 return 0;
9662}
9663
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009664
Benjamin Peterson29060642009-01-31 22:14:21 +00009665#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009666 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009667
Alexander Belopolsky40018472011-02-26 01:02:56 +00009668PyObject *
9669PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009670{
9671 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009672
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009673 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9674 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009675 if (PyUnicode_READY(left) == -1 ||
9676 PyUnicode_READY(right) == -1)
9677 return NULL;
9678 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9679 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009680 if (op == Py_EQ) {
9681 Py_INCREF(Py_False);
9682 return Py_False;
9683 }
9684 if (op == Py_NE) {
9685 Py_INCREF(Py_True);
9686 return Py_True;
9687 }
9688 }
9689 if (left == right)
9690 result = 0;
9691 else
9692 result = unicode_compare((PyUnicodeObject *)left,
9693 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009694
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009695 /* Convert the return value to a Boolean */
9696 switch (op) {
9697 case Py_EQ:
9698 v = TEST_COND(result == 0);
9699 break;
9700 case Py_NE:
9701 v = TEST_COND(result != 0);
9702 break;
9703 case Py_LE:
9704 v = TEST_COND(result <= 0);
9705 break;
9706 case Py_GE:
9707 v = TEST_COND(result >= 0);
9708 break;
9709 case Py_LT:
9710 v = TEST_COND(result == -1);
9711 break;
9712 case Py_GT:
9713 v = TEST_COND(result == 1);
9714 break;
9715 default:
9716 PyErr_BadArgument();
9717 return NULL;
9718 }
9719 Py_INCREF(v);
9720 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009721 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009722
Brian Curtindfc80e32011-08-10 20:28:54 -05009723 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009724}
9725
Alexander Belopolsky40018472011-02-26 01:02:56 +00009726int
9727PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00009728{
Thomas Wouters477c8d52006-05-27 19:21:47 +00009729 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009730 int kind1, kind2, kind;
9731 void *buf1, *buf2;
9732 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009733 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009734
9735 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00009736 sub = PyUnicode_FromObject(element);
9737 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009738 PyErr_Format(PyExc_TypeError,
9739 "'in <string>' requires string as left operand, not %s",
9740 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009741 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009742 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009743 if (PyUnicode_READY(sub) == -1)
9744 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009745
Thomas Wouters477c8d52006-05-27 19:21:47 +00009746 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +02009747 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009748 Py_DECREF(sub);
9749 return -1;
9750 }
9751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009752 kind1 = PyUnicode_KIND(str);
9753 kind2 = PyUnicode_KIND(sub);
9754 kind = kind1 > kind2 ? kind1 : kind2;
9755 buf1 = PyUnicode_DATA(str);
9756 buf2 = PyUnicode_DATA(sub);
9757 if (kind1 != kind)
9758 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9759 if (!buf1) {
9760 Py_DECREF(sub);
9761 return -1;
9762 }
9763 if (kind2 != kind)
9764 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9765 if (!buf2) {
9766 Py_DECREF(sub);
9767 if (kind1 != kind) PyMem_Free(buf1);
9768 return -1;
9769 }
9770 len1 = PyUnicode_GET_LENGTH(str);
9771 len2 = PyUnicode_GET_LENGTH(sub);
9772
9773 switch(kind) {
9774 case PyUnicode_1BYTE_KIND:
9775 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9776 break;
9777 case PyUnicode_2BYTE_KIND:
9778 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9779 break;
9780 case PyUnicode_4BYTE_KIND:
9781 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9782 break;
9783 default:
9784 result = -1;
9785 assert(0);
9786 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009787
9788 Py_DECREF(str);
9789 Py_DECREF(sub);
9790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009791 if (kind1 != kind)
9792 PyMem_Free(buf1);
9793 if (kind2 != kind)
9794 PyMem_Free(buf2);
9795
Guido van Rossum403d68b2000-03-13 15:55:09 +00009796 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00009797}
9798
Guido van Rossumd57fd912000-03-10 22:53:23 +00009799/* Concat to string or Unicode object giving a new Unicode object. */
9800
Alexander Belopolsky40018472011-02-26 01:02:56 +00009801PyObject *
9802PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 PyObject *u = NULL, *v = NULL, *w;
9805 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009806
9807 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009808 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009809 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009810 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009811 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009812 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009813 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009814
9815 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +02009816 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009817 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009818 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009819 }
Victor Stinnera464fc12011-10-02 20:39:30 +02009820 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009821 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009823 }
9824
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009825 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +02009826 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009827
Guido van Rossumd57fd912000-03-10 22:53:23 +00009828 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009829 w = PyUnicode_New(
9830 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9831 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009832 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009833 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009834 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9835 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +02009836 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009837 v, 0,
9838 PyUnicode_GET_LENGTH(v)) < 0)
9839 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009840 Py_DECREF(u);
9841 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009842 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009843
Benjamin Peterson29060642009-01-31 22:14:21 +00009844 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009845 Py_XDECREF(u);
9846 Py_XDECREF(v);
9847 return NULL;
9848}
9849
Walter Dörwald1ab83302007-05-18 17:15:44 +00009850void
Victor Stinner23e56682011-10-03 03:54:37 +02009851PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +00009852{
Victor Stinner23e56682011-10-03 03:54:37 +02009853 PyObject *left, *res;
9854
9855 if (p_left == NULL) {
9856 if (!PyErr_Occurred())
9857 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +00009858 return;
9859 }
Victor Stinner23e56682011-10-03 03:54:37 +02009860 left = *p_left;
9861 if (right == NULL || !PyUnicode_Check(left)) {
9862 if (!PyErr_Occurred())
9863 PyErr_BadInternalCall();
9864 goto error;
9865 }
9866
9867 if (PyUnicode_CheckExact(left) && left != unicode_empty
9868 && PyUnicode_CheckExact(right) && right != unicode_empty
9869 && unicode_resizable(left)
9870 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
9871 || _PyUnicode_WSTR(left) != NULL))
9872 {
9873 Py_ssize_t u_len, v_len, new_len, copied;
9874
9875 /* FIXME: don't make wstr string ready */
9876 if (PyUnicode_READY(left))
9877 goto error;
9878 if (PyUnicode_READY(right))
9879 goto error;
9880
9881 /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */
9882 if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left))
9883 {
9884 u_len = PyUnicode_GET_LENGTH(left);
9885 v_len = PyUnicode_GET_LENGTH(right);
9886 if (u_len > PY_SSIZE_T_MAX - v_len) {
9887 PyErr_SetString(PyExc_OverflowError,
9888 "strings are too large to concat");
9889 goto error;
9890 }
9891 new_len = u_len + v_len;
9892
9893 /* Now we own the last reference to 'left', so we can resize it
9894 * in-place.
9895 */
9896 if (unicode_resize(&left, new_len) != 0) {
9897 /* XXX if _PyUnicode_Resize() fails, 'left' has been
9898 * deallocated so it cannot be put back into
9899 * 'variable'. The MemoryError is raised when there
9900 * is no value in 'variable', which might (very
9901 * remotely) be a cause of incompatibilities.
9902 */
9903 goto error;
9904 }
9905 /* copy 'right' into the newly allocated area of 'left' */
9906 copied = PyUnicode_CopyCharacters(left, u_len,
9907 right, 0,
9908 v_len);
9909 assert(0 <= copied);
9910 *p_left = left;
9911 return;
9912 }
9913 }
9914
9915 res = PyUnicode_Concat(left, right);
9916 if (res == NULL)
9917 goto error;
9918 Py_DECREF(left);
9919 *p_left = res;
9920 return;
9921
9922error:
9923 Py_DECREF(*p_left);
9924 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +00009925}
9926
9927void
9928PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
9929{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009930 PyUnicode_Append(pleft, right);
9931 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00009932}
9933
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009934PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009935 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009936\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00009937Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009938string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009939interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940
9941static PyObject *
9942unicode_count(PyUnicodeObject *self, PyObject *args)
9943{
9944 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009945 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009946 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009947 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009948 int kind1, kind2, kind;
9949 void *buf1, *buf2;
9950 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009951
Jesus Ceaac451502011-04-20 17:09:23 +02009952 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
9953 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009954 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00009955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009956 kind1 = PyUnicode_KIND(self);
9957 kind2 = PyUnicode_KIND(substring);
9958 kind = kind1 > kind2 ? kind1 : kind2;
9959 buf1 = PyUnicode_DATA(self);
9960 buf2 = PyUnicode_DATA(substring);
9961 if (kind1 != kind)
9962 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9963 if (!buf1) {
9964 Py_DECREF(substring);
9965 return NULL;
9966 }
9967 if (kind2 != kind)
9968 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9969 if (!buf2) {
9970 Py_DECREF(substring);
9971 if (kind1 != kind) PyMem_Free(buf1);
9972 return NULL;
9973 }
9974 len1 = PyUnicode_GET_LENGTH(self);
9975 len2 = PyUnicode_GET_LENGTH(substring);
9976
9977 ADJUST_INDICES(start, end, len1);
9978 switch(kind) {
9979 case PyUnicode_1BYTE_KIND:
9980 iresult = ucs1lib_count(
9981 ((Py_UCS1*)buf1) + start, end - start,
9982 buf2, len2, PY_SSIZE_T_MAX
9983 );
9984 break;
9985 case PyUnicode_2BYTE_KIND:
9986 iresult = ucs2lib_count(
9987 ((Py_UCS2*)buf1) + start, end - start,
9988 buf2, len2, PY_SSIZE_T_MAX
9989 );
9990 break;
9991 case PyUnicode_4BYTE_KIND:
9992 iresult = ucs4lib_count(
9993 ((Py_UCS4*)buf1) + start, end - start,
9994 buf2, len2, PY_SSIZE_T_MAX
9995 );
9996 break;
9997 default:
9998 assert(0); iresult = 0;
9999 }
10000
10001 result = PyLong_FromSsize_t(iresult);
10002
10003 if (kind1 != kind)
10004 PyMem_Free(buf1);
10005 if (kind2 != kind)
10006 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010007
10008 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010009
Guido van Rossumd57fd912000-03-10 22:53:23 +000010010 return result;
10011}
10012
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010013PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010014 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010015\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010016Encode S using the codec registered for encoding. Default encoding\n\
10017is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010018handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010019a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10020'xmlcharrefreplace' as well as any other name registered with\n\
10021codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010022
10023static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010024unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010025{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010026 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010027 char *encoding = NULL;
10028 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010029
Benjamin Peterson308d6372009-09-18 21:42:35 +000010030 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10031 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010033 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010034}
10035
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010036PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010037 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010038\n\
10039Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010040If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010041
10042static PyObject*
10043unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10044{
10045 Py_UNICODE *e;
10046 Py_UNICODE *p;
10047 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010048 Py_UNICODE *qe;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 Py_ssize_t i, j, incr, wstr_length;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010050 PyUnicodeObject *u;
10051 int tabsize = 8;
10052
10053 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010054 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
10057 return NULL;
10058
Thomas Wouters7e474022000-07-16 12:04:32 +000010059 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010060 i = 0; /* chars up to and including most recent \n or \r */
10061 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010062 e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
10063 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010064 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010065 if (tabsize > 0) {
10066 incr = tabsize - (j % tabsize); /* cannot overflow */
10067 if (j > PY_SSIZE_T_MAX - incr)
10068 goto overflow1;
10069 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010070 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010072 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010073 if (j > PY_SSIZE_T_MAX - 1)
10074 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010075 j++;
10076 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010077 if (i > PY_SSIZE_T_MAX - j)
10078 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010079 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010080 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010081 }
10082 }
10083
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010084 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +000010085 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010086
Guido van Rossumd57fd912000-03-10 22:53:23 +000010087 /* Second pass: create output string and fill it */
10088 u = _PyUnicode_New(i + j);
10089 if (!u)
10090 return NULL;
10091
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010092 j = 0; /* same as in first pass */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 q = _PyUnicode_WSTR(u); /* next output char */
10094 qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 for (p = _PyUnicode_WSTR(self); p < e; p++)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010097 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010098 if (tabsize > 0) {
10099 i = tabsize - (j % tabsize);
10100 j += i;
10101 while (i--) {
10102 if (q >= qe)
10103 goto overflow2;
10104 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010105 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010106 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010107 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010108 else {
10109 if (q >= qe)
10110 goto overflow2;
10111 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010112 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010113 if (*p == '\n' || *p == '\r')
10114 j = 0;
10115 }
10116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 if (PyUnicode_READY(u) == -1) {
10118 Py_DECREF(u);
10119 return NULL;
10120 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010121 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010122
10123 overflow2:
10124 Py_DECREF(u);
10125 overflow1:
10126 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10127 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010128}
10129
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010130PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010131 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010132\n\
10133Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010134such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010135arguments start and end are interpreted as in slice notation.\n\
10136\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010137Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010138
10139static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010140unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141{
Jesus Ceaac451502011-04-20 17:09:23 +020010142 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010143 Py_ssize_t start;
10144 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010145 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146
Jesus Ceaac451502011-04-20 17:09:23 +020010147 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10148 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 if (PyUnicode_READY(self) == -1)
10152 return NULL;
10153 if (PyUnicode_READY(substring) == -1)
10154 return NULL;
10155
10156 result = any_find_slice(
10157 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10158 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010159 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010160
10161 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 if (result == -2)
10164 return NULL;
10165
Christian Heimes217cfd12007-12-02 14:31:20 +000010166 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167}
10168
10169static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010170unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010172 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10173 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010176}
10177
Guido van Rossumc2504932007-09-18 19:42:40 +000010178/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010179 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010180static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010181unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182{
Guido van Rossumc2504932007-09-18 19:42:40 +000010183 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010184 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 if (_PyUnicode_HASH(self) != -1)
10187 return _PyUnicode_HASH(self);
10188 if (PyUnicode_READY(self) == -1)
10189 return -1;
10190 len = PyUnicode_GET_LENGTH(self);
10191
10192 /* The hash function as a macro, gets expanded three times below. */
10193#define HASH(P) \
10194 x = (Py_uhash_t)*P << 7; \
10195 while (--len >= 0) \
10196 x = (1000003*x) ^ (Py_uhash_t)*P++;
10197
10198 switch (PyUnicode_KIND(self)) {
10199 case PyUnicode_1BYTE_KIND: {
10200 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10201 HASH(c);
10202 break;
10203 }
10204 case PyUnicode_2BYTE_KIND: {
10205 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10206 HASH(s);
10207 break;
10208 }
10209 default: {
10210 Py_UCS4 *l;
10211 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10212 "Impossible switch case in unicode_hash");
10213 l = PyUnicode_4BYTE_DATA(self);
10214 HASH(l);
10215 break;
10216 }
10217 }
10218 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10219
Guido van Rossumc2504932007-09-18 19:42:40 +000010220 if (x == -1)
10221 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010223 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010226
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010227PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010228 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010230Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010231
10232static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010235 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010236 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010237 Py_ssize_t start;
10238 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010239
Jesus Ceaac451502011-04-20 17:09:23 +020010240 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10241 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010243
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 if (PyUnicode_READY(self) == -1)
10245 return NULL;
10246 if (PyUnicode_READY(substring) == -1)
10247 return NULL;
10248
10249 result = any_find_slice(
10250 ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10251 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010252 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253
10254 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 if (result == -2)
10257 return NULL;
10258
Guido van Rossumd57fd912000-03-10 22:53:23 +000010259 if (result < 0) {
10260 PyErr_SetString(PyExc_ValueError, "substring not found");
10261 return NULL;
10262 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010263
Christian Heimes217cfd12007-12-02 14:31:20 +000010264 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010265}
10266
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010267PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010268 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010269\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010270Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010271at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010272
10273static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010274unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010275{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 Py_ssize_t i, length;
10277 int kind;
10278 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010279 int cased;
10280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 if (PyUnicode_READY(self) == -1)
10282 return NULL;
10283 length = PyUnicode_GET_LENGTH(self);
10284 kind = PyUnicode_KIND(self);
10285 data = PyUnicode_DATA(self);
10286
Guido van Rossumd57fd912000-03-10 22:53:23 +000010287 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 if (length == 1)
10289 return PyBool_FromLong(
10290 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010291
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010292 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010294 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010295
Guido van Rossumd57fd912000-03-10 22:53:23 +000010296 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 for (i = 0; i < length; i++) {
10298 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010299
Benjamin Peterson29060642009-01-31 22:14:21 +000010300 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10301 return PyBool_FromLong(0);
10302 else if (!cased && Py_UNICODE_ISLOWER(ch))
10303 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010304 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010305 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010306}
10307
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010308PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010309 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010310\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010311Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010312at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010313
10314static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010315unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 Py_ssize_t i, length;
10318 int kind;
10319 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320 int cased;
10321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 if (PyUnicode_READY(self) == -1)
10323 return NULL;
10324 length = PyUnicode_GET_LENGTH(self);
10325 kind = PyUnicode_KIND(self);
10326 data = PyUnicode_DATA(self);
10327
Guido van Rossumd57fd912000-03-10 22:53:23 +000010328 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 if (length == 1)
10330 return PyBool_FromLong(
10331 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010332
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010333 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010335 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010336
Guido van Rossumd57fd912000-03-10 22:53:23 +000010337 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 for (i = 0; i < length; i++) {
10339 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010340
Benjamin Peterson29060642009-01-31 22:14:21 +000010341 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10342 return PyBool_FromLong(0);
10343 else if (!cased && Py_UNICODE_ISUPPER(ch))
10344 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010345 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010346 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010347}
10348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010349PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010350 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010352Return True if S is a titlecased string and there is at least one\n\
10353character in S, i.e. upper- and titlecase characters may only\n\
10354follow uncased characters and lowercase characters only cased ones.\n\
10355Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010356
10357static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010358unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 Py_ssize_t i, length;
10361 int kind;
10362 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010363 int cased, previous_is_cased;
10364
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 if (PyUnicode_READY(self) == -1)
10366 return NULL;
10367 length = PyUnicode_GET_LENGTH(self);
10368 kind = PyUnicode_KIND(self);
10369 data = PyUnicode_DATA(self);
10370
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372 if (length == 1) {
10373 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10374 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10375 (Py_UNICODE_ISUPPER(ch) != 0));
10376 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010377
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010378 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010380 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010381
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382 cased = 0;
10383 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 for (i = 0; i < length; i++) {
10385 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010386
Benjamin Peterson29060642009-01-31 22:14:21 +000010387 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10388 if (previous_is_cased)
10389 return PyBool_FromLong(0);
10390 previous_is_cased = 1;
10391 cased = 1;
10392 }
10393 else if (Py_UNICODE_ISLOWER(ch)) {
10394 if (!previous_is_cased)
10395 return PyBool_FromLong(0);
10396 previous_is_cased = 1;
10397 cased = 1;
10398 }
10399 else
10400 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010401 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010402 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010403}
10404
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010405PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010406 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010407\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010408Return True if all characters in S are whitespace\n\
10409and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010410
10411static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010412unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010413{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414 Py_ssize_t i, length;
10415 int kind;
10416 void *data;
10417
10418 if (PyUnicode_READY(self) == -1)
10419 return NULL;
10420 length = PyUnicode_GET_LENGTH(self);
10421 kind = PyUnicode_KIND(self);
10422 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010423
Guido van Rossumd57fd912000-03-10 22:53:23 +000010424 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 if (length == 1)
10426 return PyBool_FromLong(
10427 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010428
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010429 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010431 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 for (i = 0; i < length; i++) {
10434 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010435 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010436 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010437 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010438 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010439}
10440
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010441PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010442 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010443\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010444Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010445and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010446
10447static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010448unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010449{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 Py_ssize_t i, length;
10451 int kind;
10452 void *data;
10453
10454 if (PyUnicode_READY(self) == -1)
10455 return NULL;
10456 length = PyUnicode_GET_LENGTH(self);
10457 kind = PyUnicode_KIND(self);
10458 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010459
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010460 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461 if (length == 1)
10462 return PyBool_FromLong(
10463 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010464
10465 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010467 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 for (i = 0; i < length; i++) {
10470 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010471 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010472 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010473 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010474}
10475
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010476PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010477 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010478\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010479Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010480and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010481
10482static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010483unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010484{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 int kind;
10486 void *data;
10487 Py_ssize_t len, i;
10488
10489 if (PyUnicode_READY(self) == -1)
10490 return NULL;
10491
10492 kind = PyUnicode_KIND(self);
10493 data = PyUnicode_DATA(self);
10494 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010495
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010496 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 if (len == 1) {
10498 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10499 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10500 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010501
10502 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010504 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010505
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 for (i = 0; i < len; i++) {
10507 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010508 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010509 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010510 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010511 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010512}
10513
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010514PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010515 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010516\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010517Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010518False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010519
10520static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010521unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 Py_ssize_t i, length;
10524 int kind;
10525 void *data;
10526
10527 if (PyUnicode_READY(self) == -1)
10528 return NULL;
10529 length = PyUnicode_GET_LENGTH(self);
10530 kind = PyUnicode_KIND(self);
10531 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010532
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 if (length == 1)
10535 return PyBool_FromLong(
10536 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010537
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010538 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010540 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010541
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 for (i = 0; i < length; i++) {
10543 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010544 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010545 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010546 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547}
10548
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010549PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010550 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010551\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010552Return True if all characters in S are digits\n\
10553and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010554
10555static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010556unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010557{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 Py_ssize_t i, length;
10559 int kind;
10560 void *data;
10561
10562 if (PyUnicode_READY(self) == -1)
10563 return NULL;
10564 length = PyUnicode_GET_LENGTH(self);
10565 kind = PyUnicode_KIND(self);
10566 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010567
Guido van Rossumd57fd912000-03-10 22:53:23 +000010568 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010569 if (length == 1) {
10570 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10571 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10572 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010573
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010574 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010576 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010577
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 for (i = 0; i < length; i++) {
10579 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010580 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010581 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010582 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010583}
10584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010585PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010586 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010587\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010588Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010589False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010590
10591static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010592unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 Py_ssize_t i, length;
10595 int kind;
10596 void *data;
10597
10598 if (PyUnicode_READY(self) == -1)
10599 return NULL;
10600 length = PyUnicode_GET_LENGTH(self);
10601 kind = PyUnicode_KIND(self);
10602 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 if (length == 1)
10606 return PyBool_FromLong(
10607 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010608
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010609 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010611 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 for (i = 0; i < length; i++) {
10614 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010615 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010616 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010617 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010618}
10619
Martin v. Löwis47383402007-08-15 07:32:56 +000010620int
10621PyUnicode_IsIdentifier(PyObject *self)
10622{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 int kind;
10624 void *data;
10625 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010626 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 if (PyUnicode_READY(self) == -1) {
10629 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010630 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 }
10632
10633 /* Special case for empty strings */
10634 if (PyUnicode_GET_LENGTH(self) == 0)
10635 return 0;
10636 kind = PyUnicode_KIND(self);
10637 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010638
10639 /* PEP 3131 says that the first character must be in
10640 XID_Start and subsequent characters in XID_Continue,
10641 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000010642 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000010643 letters, digits, underscore). However, given the current
10644 definition of XID_Start and XID_Continue, it is sufficient
10645 to check just for these, except that _ must be allowed
10646 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050010648 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000010649 return 0;
10650
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040010651 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010653 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000010654 return 1;
10655}
10656
10657PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010658 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000010659\n\
10660Return True if S is a valid identifier according\n\
10661to the language definition.");
10662
10663static PyObject*
10664unicode_isidentifier(PyObject *self)
10665{
10666 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10667}
10668
Georg Brandl559e5d72008-06-11 18:37:52 +000010669PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010670 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000010671\n\
10672Return True if all characters in S are considered\n\
10673printable in repr() or S is empty, False otherwise.");
10674
10675static PyObject*
10676unicode_isprintable(PyObject *self)
10677{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 Py_ssize_t i, length;
10679 int kind;
10680 void *data;
10681
10682 if (PyUnicode_READY(self) == -1)
10683 return NULL;
10684 length = PyUnicode_GET_LENGTH(self);
10685 kind = PyUnicode_KIND(self);
10686 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000010687
10688 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010689 if (length == 1)
10690 return PyBool_FromLong(
10691 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000010692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 for (i = 0; i < length; i++) {
10694 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000010695 Py_RETURN_FALSE;
10696 }
10697 }
10698 Py_RETURN_TRUE;
10699}
10700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010701PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000010702 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703\n\
10704Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000010705iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010706
10707static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010708unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010709{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010710 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711}
10712
Martin v. Löwis18e16552006-02-15 17:27:45 +000010713static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714unicode_length(PyUnicodeObject *self)
10715{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 if (PyUnicode_READY(self) == -1)
10717 return -1;
10718 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719}
10720
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010721PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010722 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000010724Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010725done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726
10727static PyObject *
10728unicode_ljust(PyUnicodeObject *self, PyObject *args)
10729{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010730 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010731 Py_UCS4 fillchar = ' ';
10732
10733 if (PyUnicode_READY(self) == -1)
10734 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010735
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010736 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737 return NULL;
10738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740 Py_INCREF(self);
10741 return (PyObject*) self;
10742 }
10743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010744 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745}
10746
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010747PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010748 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010750Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751
10752static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010753unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010754{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755 return fixup(self, fixlower);
10756}
10757
/* Selectors for the strip family: which end(s) of the string to trim */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string minus its "|O:" prefix */
#define STRIPNAME(i) (&stripformat[i][3])
10766
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010767/* externally visible for str.strip(unicode) */
10768PyObject *
10769_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10770{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 void *data;
10772 int kind;
10773 Py_ssize_t i, j, len;
10774 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10777 return NULL;
10778
10779 kind = PyUnicode_KIND(self);
10780 data = PyUnicode_DATA(self);
10781 len = PyUnicode_GET_LENGTH(self);
10782 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10783 PyUnicode_DATA(sepobj),
10784 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010785
Benjamin Peterson14339b62009-01-31 16:36:08 +000010786 i = 0;
10787 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788 while (i < len &&
10789 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010790 i++;
10791 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010792 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010793
Benjamin Peterson14339b62009-01-31 16:36:08 +000010794 j = len;
10795 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010796 do {
10797 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 } while (j >= i &&
10799 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000010800 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010801 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010802
Victor Stinner12bab6d2011-10-01 01:53:49 +020010803 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010804}
10805
10806PyObject*
10807PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10808{
10809 unsigned char *data;
10810 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020010811 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010812
Victor Stinnerde636f32011-10-01 03:55:54 +020010813 if (PyUnicode_READY(self) == -1)
10814 return NULL;
10815
10816 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10817
Victor Stinner12bab6d2011-10-01 01:53:49 +020010818 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010820 if (PyUnicode_CheckExact(self)) {
10821 Py_INCREF(self);
10822 return self;
10823 }
10824 else
10825 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010826 }
10827
Victor Stinner12bab6d2011-10-01 01:53:49 +020010828 length = end - start;
10829 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010830 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010831
Victor Stinnerde636f32011-10-01 03:55:54 +020010832 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020010833 PyErr_SetString(PyExc_IndexError, "string index out of range");
10834 return NULL;
10835 }
10836
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010837 kind = PyUnicode_KIND(self);
10838 data = PyUnicode_1BYTE_DATA(self);
Victor Stinner034f6cf2011-09-30 02:26:44 +020010839 return PyUnicode_FromKindAndData(kind,
10840 data + PyUnicode_KIND_SIZE(kind, start),
Victor Stinner12bab6d2011-10-01 01:53:49 +020010841 length);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010842}
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843
10844static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010845do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010847 int kind;
10848 void *data;
10849 Py_ssize_t len, i, j;
10850
10851 if (PyUnicode_READY(self) == -1)
10852 return NULL;
10853
10854 kind = PyUnicode_KIND(self);
10855 data = PyUnicode_DATA(self);
10856 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010857
Benjamin Peterson14339b62009-01-31 16:36:08 +000010858 i = 0;
10859 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010860 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010861 i++;
10862 }
10863 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010864
Benjamin Peterson14339b62009-01-31 16:36:08 +000010865 j = len;
10866 if (striptype != LEFTSTRIP) {
10867 do {
10868 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010869 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010870 j++;
10871 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010872
Victor Stinner12bab6d2011-10-01 01:53:49 +020010873 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010874}
10875
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010876
10877static PyObject *
10878do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10879{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010880 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010881
Benjamin Peterson14339b62009-01-31 16:36:08 +000010882 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10883 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010884
Benjamin Peterson14339b62009-01-31 16:36:08 +000010885 if (sep != NULL && sep != Py_None) {
10886 if (PyUnicode_Check(sep))
10887 return _PyUnicode_XStrip(self, striptype, sep);
10888 else {
10889 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010890 "%s arg must be None or str",
10891 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000010892 return NULL;
10893 }
10894 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010895
Benjamin Peterson14339b62009-01-31 16:36:08 +000010896 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010897}
10898
10899
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010900PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010901 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010902\n\
10903Return a copy of the string S with leading and trailing\n\
10904whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010905If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010906
10907static PyObject *
10908unicode_strip(PyUnicodeObject *self, PyObject *args)
10909{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010910 if (PyTuple_GET_SIZE(args) == 0)
10911 return do_strip(self, BOTHSTRIP); /* Common case */
10912 else
10913 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010914}
10915
10916
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010917PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010918 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010919\n\
10920Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010921If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010922
10923static PyObject *
10924unicode_lstrip(PyUnicodeObject *self, PyObject *args)
10925{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010926 if (PyTuple_GET_SIZE(args) == 0)
10927 return do_strip(self, LEFTSTRIP); /* Common case */
10928 else
10929 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010930}
10931
10932
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010933PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010934 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010935\n\
10936Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010937If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010938
10939static PyObject *
10940unicode_rstrip(PyUnicodeObject *self, PyObject *args)
10941{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010942 if (PyTuple_GET_SIZE(args) == 0)
10943 return do_strip(self, RIGHTSTRIP); /* Common case */
10944 else
10945 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000010946}
10947
10948
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000010950unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951{
10952 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010953 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954
Georg Brandl222de0f2009-04-12 12:01:50 +000010955 if (len < 1) {
10956 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020010957 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000010958 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959
Tim Peters7a29bd52001-09-12 03:03:31 +000010960 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961 /* no repeat, return original string */
10962 Py_INCREF(str);
10963 return (PyObject*) str;
10964 }
Tim Peters8f422462000-09-09 06:13:41 +000010965
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 if (PyUnicode_READY(str) == -1)
10967 return NULL;
10968
Victor Stinnerc759f3e2011-10-01 03:09:58 +020010969 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020010970 PyErr_SetString(PyExc_OverflowError,
10971 "repeated string is too long");
10972 return NULL;
10973 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010974 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977 if (!u)
10978 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020010979 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010981 if (PyUnicode_GET_LENGTH(str) == 1) {
10982 const int kind = PyUnicode_KIND(str);
10983 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
10984 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020010985 if (kind == PyUnicode_1BYTE_KIND)
10986 memset(to, (unsigned char)fill_char, len);
10987 else {
10988 for (n = 0; n < len; ++n)
10989 PyUnicode_WRITE(kind, to, n, fill_char);
10990 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 }
10992 else {
10993 /* number of characters copied this far */
10994 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
10995 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
10996 char *to = (char *) PyUnicode_DATA(u);
10997 Py_MEMCPY(to, PyUnicode_DATA(str),
10998 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000010999 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011000 n = (done <= nchars-done) ? done : nchars-done;
11001 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011002 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011003 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004 }
11005
11006 return (PyObject*) u;
11007}
11008
Alexander Belopolsky40018472011-02-26 01:02:56 +000011009PyObject *
11010PyUnicode_Replace(PyObject *obj,
11011 PyObject *subobj,
11012 PyObject *replobj,
11013 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014{
11015 PyObject *self;
11016 PyObject *str1;
11017 PyObject *str2;
11018 PyObject *result;
11019
11020 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011021 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011022 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011024 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011025 Py_DECREF(self);
11026 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027 }
11028 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011029 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011030 Py_DECREF(self);
11031 Py_DECREF(str1);
11032 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011034 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011035 Py_DECREF(self);
11036 Py_DECREF(str1);
11037 Py_DECREF(str2);
11038 return result;
11039}
11040
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011041PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011042 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043\n\
11044Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011045old replaced by new. If the optional argument count is\n\
11046given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011047
11048static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011049unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011051 PyObject *str1;
11052 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011053 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011054 PyObject *result;
11055
Martin v. Löwis18e16552006-02-15 17:27:45 +000011056 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011058 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011059 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011060 str1 = PyUnicode_FromObject(str1);
11061 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11062 return NULL;
11063 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011064 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011065 Py_DECREF(str1);
11066 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011067 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011068
11069 result = replace(self, str1, str2, maxcount);
11070
11071 Py_DECREF(str1);
11072 Py_DECREF(str2);
11073 return result;
11074}
11075
Alexander Belopolsky40018472011-02-26 01:02:56 +000011076static PyObject *
11077unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011079 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011080 Py_ssize_t isize;
11081 Py_ssize_t osize, squote, dquote, i, o;
11082 Py_UCS4 max, quote;
11083 int ikind, okind;
11084 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011085
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011086 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011087 return NULL;
11088
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011089 isize = PyUnicode_GET_LENGTH(unicode);
11090 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 /* Compute length of output, quote characters, and
11093 maximum character */
11094 osize = 2; /* quotes */
11095 max = 127;
11096 squote = dquote = 0;
11097 ikind = PyUnicode_KIND(unicode);
11098 for (i = 0; i < isize; i++) {
11099 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11100 switch (ch) {
11101 case '\'': squote++; osize++; break;
11102 case '"': dquote++; osize++; break;
11103 case '\\': case '\t': case '\r': case '\n':
11104 osize += 2; break;
11105 default:
11106 /* Fast-path ASCII */
11107 if (ch < ' ' || ch == 0x7f)
11108 osize += 4; /* \xHH */
11109 else if (ch < 0x7f)
11110 osize++;
11111 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11112 osize++;
11113 max = ch > max ? ch : max;
11114 }
11115 else if (ch < 0x100)
11116 osize += 4; /* \xHH */
11117 else if (ch < 0x10000)
11118 osize += 6; /* \uHHHH */
11119 else
11120 osize += 10; /* \uHHHHHHHH */
11121 }
11122 }
11123
11124 quote = '\'';
11125 if (squote) {
11126 if (dquote)
11127 /* Both squote and dquote present. Use squote,
11128 and escape them */
11129 osize += squote;
11130 else
11131 quote = '"';
11132 }
11133
11134 repr = PyUnicode_New(osize, max);
11135 if (repr == NULL)
11136 return NULL;
11137 okind = PyUnicode_KIND(repr);
11138 odata = PyUnicode_DATA(repr);
11139
11140 PyUnicode_WRITE(okind, odata, 0, quote);
11141 PyUnicode_WRITE(okind, odata, osize-1, quote);
11142
11143 for (i = 0, o = 1; i < isize; i++) {
11144 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011145
11146 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011147 if ((ch == quote) || (ch == '\\')) {
11148 PyUnicode_WRITE(okind, odata, o++, '\\');
11149 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011150 continue;
11151 }
11152
Benjamin Peterson29060642009-01-31 22:14:21 +000011153 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011154 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011155 PyUnicode_WRITE(okind, odata, o++, '\\');
11156 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011157 }
11158 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159 PyUnicode_WRITE(okind, odata, o++, '\\');
11160 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011161 }
11162 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011163 PyUnicode_WRITE(okind, odata, o++, '\\');
11164 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011165 }
11166
11167 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011168 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011169 PyUnicode_WRITE(okind, odata, o++, '\\');
11170 PyUnicode_WRITE(okind, odata, o++, 'x');
11171 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11172 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011173 }
11174
Georg Brandl559e5d72008-06-11 18:37:52 +000011175 /* Copy ASCII characters as-is */
11176 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011178 }
11179
Benjamin Peterson29060642009-01-31 22:14:21 +000011180 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011181 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011182 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011183 (categories Z* and C* except ASCII space)
11184 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011185 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011186 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011187 if (ch <= 0xff) {
11188 PyUnicode_WRITE(okind, odata, o++, '\\');
11189 PyUnicode_WRITE(okind, odata, o++, 'x');
11190 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11191 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011192 }
11193 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011194 else if (ch >= 0x10000) {
11195 PyUnicode_WRITE(okind, odata, o++, '\\');
11196 PyUnicode_WRITE(okind, odata, o++, 'U');
11197 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11198 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11199 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11200 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11201 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11202 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11203 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11204 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011205 }
11206 /* Map 16-bit characters to '\uxxxx' */
11207 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011208 PyUnicode_WRITE(okind, odata, o++, '\\');
11209 PyUnicode_WRITE(okind, odata, o++, 'u');
11210 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11211 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11212 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11213 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011214 }
11215 }
11216 /* Copy characters as-is */
11217 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011218 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011219 }
11220 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011221 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011222 /* Closing quote already added at the beginning */
Walter Dörwald79e913e2007-05-12 11:08:06 +000011223 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011224}
11225
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011226PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011227 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228\n\
11229Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011230such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231arguments start and end are interpreted as in slice notation.\n\
11232\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011233Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234
11235static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011236unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237{
Jesus Ceaac451502011-04-20 17:09:23 +020011238 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011239 Py_ssize_t start;
11240 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011241 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242
Jesus Ceaac451502011-04-20 17:09:23 +020011243 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11244 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011245 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011247 if (PyUnicode_READY(self) == -1)
11248 return NULL;
11249 if (PyUnicode_READY(substring) == -1)
11250 return NULL;
11251
11252 result = any_find_slice(
11253 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11254 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011255 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256
11257 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011259 if (result == -2)
11260 return NULL;
11261
Christian Heimes217cfd12007-12-02 14:31:20 +000011262 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263}
11264
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011265PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011266 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011267\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011268Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269
11270static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011271unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272{
Jesus Ceaac451502011-04-20 17:09:23 +020011273 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011274 Py_ssize_t start;
11275 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011276 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277
Jesus Ceaac451502011-04-20 17:09:23 +020011278 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11279 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011280 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011282 if (PyUnicode_READY(self) == -1)
11283 return NULL;
11284 if (PyUnicode_READY(substring) == -1)
11285 return NULL;
11286
11287 result = any_find_slice(
11288 ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11289 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011290 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291
11292 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011293
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011294 if (result == -2)
11295 return NULL;
11296
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297 if (result < 0) {
11298 PyErr_SetString(PyExc_ValueError, "substring not found");
11299 return NULL;
11300 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011301
Christian Heimes217cfd12007-12-02 14:31:20 +000011302 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303}
11304
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011305PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011306 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011308Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011309done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011310
11311static PyObject *
11312unicode_rjust(PyUnicodeObject *self, PyObject *args)
11313{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011314 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011315 Py_UCS4 fillchar = ' ';
11316
Victor Stinnere9a29352011-10-01 02:14:59 +020011317 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011318 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011319
Victor Stinnere9a29352011-10-01 02:14:59 +020011320 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321 return NULL;
11322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011323 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324 Py_INCREF(self);
11325 return (PyObject*) self;
11326 }
11327
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011328 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329}
11330
Alexander Belopolsky40018472011-02-26 01:02:56 +000011331PyObject *
11332PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011333{
11334 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011335
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336 s = PyUnicode_FromObject(s);
11337 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011338 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011339 if (sep != NULL) {
11340 sep = PyUnicode_FromObject(sep);
11341 if (sep == NULL) {
11342 Py_DECREF(s);
11343 return NULL;
11344 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345 }
11346
11347 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11348
11349 Py_DECREF(s);
11350 Py_XDECREF(sep);
11351 return result;
11352}
11353
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011354PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011355 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356\n\
11357Return a list of the words in S, using sep as the\n\
11358delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011359splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011360whitespace string is a separator and empty strings are\n\
11361removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362
11363static PyObject*
11364unicode_split(PyUnicodeObject *self, PyObject *args)
11365{
11366 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011367 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368
Martin v. Löwis18e16552006-02-15 17:27:45 +000011369 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370 return NULL;
11371
11372 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011373 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011375 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011377 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011378}
11379
Thomas Wouters477c8d52006-05-27 19:21:47 +000011380PyObject *
11381PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11382{
11383 PyObject* str_obj;
11384 PyObject* sep_obj;
11385 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011386 int kind1, kind2, kind;
11387 void *buf1 = NULL, *buf2 = NULL;
11388 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011389
11390 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011391 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011392 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011393 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011395 Py_DECREF(str_obj);
11396 return NULL;
11397 }
11398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011399 kind1 = PyUnicode_KIND(str_in);
11400 kind2 = PyUnicode_KIND(sep_obj);
11401 kind = kind1 > kind2 ? kind1 : kind2;
11402 buf1 = PyUnicode_DATA(str_in);
11403 if (kind1 != kind)
11404 buf1 = _PyUnicode_AsKind(str_in, kind);
11405 if (!buf1)
11406 goto onError;
11407 buf2 = PyUnicode_DATA(sep_obj);
11408 if (kind2 != kind)
11409 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11410 if (!buf2)
11411 goto onError;
11412 len1 = PyUnicode_GET_LENGTH(str_obj);
11413 len2 = PyUnicode_GET_LENGTH(sep_obj);
11414
11415 switch(PyUnicode_KIND(str_in)) {
11416 case PyUnicode_1BYTE_KIND:
11417 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11418 break;
11419 case PyUnicode_2BYTE_KIND:
11420 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11421 break;
11422 case PyUnicode_4BYTE_KIND:
11423 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11424 break;
11425 default:
11426 assert(0);
11427 out = 0;
11428 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011429
11430 Py_DECREF(sep_obj);
11431 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011432 if (kind1 != kind)
11433 PyMem_Free(buf1);
11434 if (kind2 != kind)
11435 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011436
11437 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011438 onError:
11439 Py_DECREF(sep_obj);
11440 Py_DECREF(str_obj);
11441 if (kind1 != kind && buf1)
11442 PyMem_Free(buf1);
11443 if (kind2 != kind && buf2)
11444 PyMem_Free(buf2);
11445 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011446}
11447
11448
11449PyObject *
11450PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11451{
11452 PyObject* str_obj;
11453 PyObject* sep_obj;
11454 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 int kind1, kind2, kind;
11456 void *buf1 = NULL, *buf2 = NULL;
11457 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011458
11459 str_obj = PyUnicode_FromObject(str_in);
11460 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011461 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011462 sep_obj = PyUnicode_FromObject(sep_in);
11463 if (!sep_obj) {
11464 Py_DECREF(str_obj);
11465 return NULL;
11466 }
11467
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011468 kind1 = PyUnicode_KIND(str_in);
11469 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011470 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471 buf1 = PyUnicode_DATA(str_in);
11472 if (kind1 != kind)
11473 buf1 = _PyUnicode_AsKind(str_in, kind);
11474 if (!buf1)
11475 goto onError;
11476 buf2 = PyUnicode_DATA(sep_obj);
11477 if (kind2 != kind)
11478 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11479 if (!buf2)
11480 goto onError;
11481 len1 = PyUnicode_GET_LENGTH(str_obj);
11482 len2 = PyUnicode_GET_LENGTH(sep_obj);
11483
11484 switch(PyUnicode_KIND(str_in)) {
11485 case PyUnicode_1BYTE_KIND:
11486 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11487 break;
11488 case PyUnicode_2BYTE_KIND:
11489 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11490 break;
11491 case PyUnicode_4BYTE_KIND:
11492 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11493 break;
11494 default:
11495 assert(0);
11496 out = 0;
11497 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011498
11499 Py_DECREF(sep_obj);
11500 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011501 if (kind1 != kind)
11502 PyMem_Free(buf1);
11503 if (kind2 != kind)
11504 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011505
11506 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011507 onError:
11508 Py_DECREF(sep_obj);
11509 Py_DECREF(str_obj);
11510 if (kind1 != kind && buf1)
11511 PyMem_Free(buf1);
11512 if (kind2 != kind && buf2)
11513 PyMem_Free(buf2);
11514 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011515}
11516
11517PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011518 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011519\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011520Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011521the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011522found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011523
11524static PyObject*
11525unicode_partition(PyUnicodeObject *self, PyObject *separator)
11526{
11527 return PyUnicode_Partition((PyObject *)self, separator);
11528}
11529
11530PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011531 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011532\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011533Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011534the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011535separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011536
11537static PyObject*
11538unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11539{
11540 return PyUnicode_RPartition((PyObject *)self, separator);
11541}
11542
Alexander Belopolsky40018472011-02-26 01:02:56 +000011543PyObject *
11544PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011545{
11546 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011547
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011548 s = PyUnicode_FromObject(s);
11549 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011550 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011551 if (sep != NULL) {
11552 sep = PyUnicode_FromObject(sep);
11553 if (sep == NULL) {
11554 Py_DECREF(s);
11555 return NULL;
11556 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011557 }
11558
11559 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11560
11561 Py_DECREF(s);
11562 Py_XDECREF(sep);
11563 return result;
11564}
11565
11566PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011567 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011568\n\
11569Return a list of the words in S, using sep as the\n\
11570delimiter string, starting at the end of the string and\n\
11571working to the front. If maxsplit is given, at most maxsplit\n\
11572splits are done. If sep is not specified, any whitespace string\n\
11573is a separator.");
11574
11575static PyObject*
11576unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11577{
11578 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011579 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011580
Martin v. Löwis18e16552006-02-15 17:27:45 +000011581 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011582 return NULL;
11583
11584 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011585 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011586 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +000011587 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011588 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011589 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011590}
11591
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011592PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011593 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594\n\
11595Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011596Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011597is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598
11599static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011600unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011602 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011603 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011605 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11606 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607 return NULL;
11608
Guido van Rossum86662912000-04-11 15:38:46 +000011609 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610}
11611
11612static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011613PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614{
Walter Dörwald346737f2007-05-31 10:44:43 +000011615 if (PyUnicode_CheckExact(self)) {
11616 Py_INCREF(self);
11617 return self;
11618 } else
11619 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011620 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621}
11622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011623PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011624 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625\n\
11626Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011627and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628
11629static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011630unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632 return fixup(self, fixswapcase);
11633}
11634
Georg Brandlceee0772007-11-27 23:48:05 +000011635PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011636 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011637\n\
11638Return a translation table usable for str.translate().\n\
11639If there is only one argument, it must be a dictionary mapping Unicode\n\
11640ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011641Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000011642If there are two arguments, they must be strings of equal length, and\n\
11643in the resulting dictionary, each character in x will be mapped to the\n\
11644character at the same position in y. If there is a third argument, it\n\
11645must be a string, whose characters will be mapped to None in the result.");
11646
11647static PyObject*
11648unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11649{
11650 PyObject *x, *y = NULL, *z = NULL;
11651 PyObject *new = NULL, *key, *value;
11652 Py_ssize_t i = 0;
11653 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011654
Georg Brandlceee0772007-11-27 23:48:05 +000011655 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11656 return NULL;
11657 new = PyDict_New();
11658 if (!new)
11659 return NULL;
11660 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 int x_kind, y_kind, z_kind;
11662 void *x_data, *y_data, *z_data;
11663
Georg Brandlceee0772007-11-27 23:48:05 +000011664 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000011665 if (!PyUnicode_Check(x)) {
11666 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11667 "be a string if there is a second argument");
11668 goto err;
11669 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011670 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011671 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11672 "arguments must have equal length");
11673 goto err;
11674 }
11675 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676 x_kind = PyUnicode_KIND(x);
11677 y_kind = PyUnicode_KIND(y);
11678 x_data = PyUnicode_DATA(x);
11679 y_data = PyUnicode_DATA(y);
11680 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11681 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11682 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011683 if (!key || !value)
11684 goto err;
11685 res = PyDict_SetItem(new, key, value);
11686 Py_DECREF(key);
11687 Py_DECREF(value);
11688 if (res < 0)
11689 goto err;
11690 }
11691 /* create entries for deleting chars in z */
11692 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693 z_kind = PyUnicode_KIND(z);
11694 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000011695 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000011697 if (!key)
11698 goto err;
11699 res = PyDict_SetItem(new, key, Py_None);
11700 Py_DECREF(key);
11701 if (res < 0)
11702 goto err;
11703 }
11704 }
11705 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706 int kind;
11707 void *data;
11708
Georg Brandlceee0772007-11-27 23:48:05 +000011709 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000011710 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011711 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11712 "to maketrans it must be a dict");
11713 goto err;
11714 }
11715 /* copy entries into the new dict, converting string keys to int keys */
11716 while (PyDict_Next(x, &i, &key, &value)) {
11717 if (PyUnicode_Check(key)) {
11718 /* convert string keys to integer keys */
11719 PyObject *newkey;
11720 if (PyUnicode_GET_SIZE(key) != 1) {
11721 PyErr_SetString(PyExc_ValueError, "string keys in translate "
11722 "table must be of length 1");
11723 goto err;
11724 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011725 kind = PyUnicode_KIND(key);
11726 data = PyUnicode_DATA(key);
11727 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000011728 if (!newkey)
11729 goto err;
11730 res = PyDict_SetItem(new, newkey, value);
11731 Py_DECREF(newkey);
11732 if (res < 0)
11733 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000011734 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000011735 /* just keep integer keys */
11736 if (PyDict_SetItem(new, key, value) < 0)
11737 goto err;
11738 } else {
11739 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11740 "be strings or integers");
11741 goto err;
11742 }
11743 }
11744 }
11745 return new;
11746 err:
11747 Py_DECREF(new);
11748 return NULL;
11749}
11750
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011751PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011752 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753\n\
11754Return a copy of the string S, where all characters have been mapped\n\
11755through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011756Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000011757Unmapped characters are left untouched. Characters mapped to None\n\
11758are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759
11760static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011761unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011763 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764}
11765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011766PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011767 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011769Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770
11771static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011772unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774 return fixup(self, fixupper);
11775}
11776
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011777PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011778 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000011780Pad a numeric string S with zeros on the left, to fill a field\n\
11781of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782
11783static PyObject *
11784unicode_zfill(PyUnicodeObject *self, PyObject *args)
11785{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011786 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787 PyUnicodeObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011788 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 int kind;
11790 void *data;
11791 Py_UCS4 chr;
11792
11793 if (PyUnicode_READY(self) == -1)
11794 return NULL;
11795
Martin v. Löwis18e16552006-02-15 17:27:45 +000011796 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797 return NULL;
11798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000011800 if (PyUnicode_CheckExact(self)) {
11801 Py_INCREF(self);
11802 return (PyObject*) self;
11803 }
11804 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020011805 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806 }
11807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011808 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809
11810 u = pad(self, fill, 0, '0');
11811
Walter Dörwald068325e2002-04-15 13:36:47 +000011812 if (u == NULL)
11813 return NULL;
11814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 kind = PyUnicode_KIND(u);
11816 data = PyUnicode_DATA(u);
11817 chr = PyUnicode_READ(kind, data, fill);
11818
11819 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 PyUnicode_WRITE(kind, data, 0, chr);
11822 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823 }
11824
11825 return (PyObject*) u;
11826}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827
#if 0
/* Compiled out.  Thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return ascii;
}
#endif
11835
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011836PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011837 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011839Return True if S starts with the specified prefix, False otherwise.\n\
11840With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011841With optional end, stop comparing S at that position.\n\
11842prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843
11844static PyObject *
11845unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011846 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011848 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011850 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011851 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011852 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853
Jesus Ceaac451502011-04-20 17:09:23 +020011854 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011855 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011856 if (PyTuple_Check(subobj)) {
11857 Py_ssize_t i;
11858 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11859 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011860 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011861 if (substring == NULL)
11862 return NULL;
11863 result = tailmatch(self, substring, start, end, -1);
11864 Py_DECREF(substring);
11865 if (result) {
11866 Py_RETURN_TRUE;
11867 }
11868 }
11869 /* nothing matched */
11870 Py_RETURN_FALSE;
11871 }
11872 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011873 if (substring == NULL) {
11874 if (PyErr_ExceptionMatches(PyExc_TypeError))
11875 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11876 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011877 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011878 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011879 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011881 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882}
11883
11884
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011885PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011886 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000011888Return True if S ends with the specified suffix, False otherwise.\n\
11889With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011890With optional end, stop comparing S at that position.\n\
11891suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892
11893static PyObject *
11894unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000011895 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011897 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011899 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011900 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011901 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902
Jesus Ceaac451502011-04-20 17:09:23 +020011903 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011904 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011905 if (PyTuple_Check(subobj)) {
11906 Py_ssize_t i;
11907 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11908 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000011909 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011910 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011911 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011912 result = tailmatch(self, substring, start, end, +1);
11913 Py_DECREF(substring);
11914 if (result) {
11915 Py_RETURN_TRUE;
11916 }
11917 }
11918 Py_RETURN_FALSE;
11919 }
11920 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030011921 if (substring == NULL) {
11922 if (PyErr_ExceptionMatches(PyExc_TypeError))
11923 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
11924 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000011925 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030011926 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011927 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000011929 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930}
11931
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011932#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000011933
11934PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011935 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011936\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011937Return a formatted version of S, using substitutions from args and kwargs.\n\
11938The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000011939
Eric Smith27bbca62010-11-04 17:06:58 +000011940PyDoc_STRVAR(format_map__doc__,
11941 "S.format_map(mapping) -> str\n\
11942\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011943Return a formatted version of S, using substitutions from mapping.\n\
11944The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000011945
Eric Smith4a7d76d2008-05-30 18:10:19 +000011946static PyObject *
11947unicode__format__(PyObject* self, PyObject* args)
11948{
11949 PyObject *format_spec;
11950
11951 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
11952 return NULL;
11953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011954 return _PyUnicode_FormatAdvanced(self, format_spec, 0,
11955 PyUnicode_GET_LENGTH(format_spec));
Eric Smith4a7d76d2008-05-30 18:10:19 +000011956}
11957
Eric Smith8c663262007-08-25 02:26:07 +000011958PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011959 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000011960\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000011961Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000011962
11963static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011964unicode__sizeof__(PyUnicodeObject *v)
11965{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 Py_ssize_t size;
11967
11968 /* If it's a compact object, account for base structure +
11969 character data. */
11970 if (PyUnicode_IS_COMPACT_ASCII(v))
11971 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
11972 else if (PyUnicode_IS_COMPACT(v))
11973 size = sizeof(PyCompactUnicodeObject) +
11974 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
11975 else {
11976 /* If it is a two-block object, account for base object, and
11977 for character block if present. */
11978 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020011979 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 size += (PyUnicode_GET_LENGTH(v) + 1) *
11981 PyUnicode_CHARACTER_SIZE(v);
11982 }
11983 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020011984 with the data pointer. Check if the data is not shared. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011985 if (_PyUnicode_WSTR(v) &&
Victor Stinnera3be6132011-10-03 02:16:37 +020011986 (PyUnicode_DATA(v) != _PyUnicode_WSTR(v)))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011987 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020011988 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020011989 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990
11991 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011992}
11993
11994PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011995 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000011996
11997static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020011998unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000011999{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012000 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001 if (!copy)
12002 return NULL;
12003 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012004}
12005
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006static PyMethodDef unicode_methods[] = {
12007
12008 /* Order is according to common usage: often used methods should
12009 appear first, since lookup is done sequentially. */
12010
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012011 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012012 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12013 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012014 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012015 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12016 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12017 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12018 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12019 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12020 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12021 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012022 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012023 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12024 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12025 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012026 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012027 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12028 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12029 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012030 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012031 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012032 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012033 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012034 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12035 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12036 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12037 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12038 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12039 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12040 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12041 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12042 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12043 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12044 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12045 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12046 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12047 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012048 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012049 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012050 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012051 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012052 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012053 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012054 {"maketrans", (PyCFunction) unicode_maketrans,
12055 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012056 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012057#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012058 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059#endif
12060
12061#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012062 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012063 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064#endif
12065
Benjamin Peterson14339b62009-01-31 16:36:08 +000012066 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067 {NULL, NULL}
12068};
12069
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012070static PyObject *
12071unicode_mod(PyObject *v, PyObject *w)
12072{
Brian Curtindfc80e32011-08-10 20:28:54 -050012073 if (!PyUnicode_Check(v))
12074 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012075 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012076}
12077
12078static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012079 0, /*nb_add*/
12080 0, /*nb_subtract*/
12081 0, /*nb_multiply*/
12082 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012083};
12084
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012086 (lenfunc) unicode_length, /* sq_length */
12087 PyUnicode_Concat, /* sq_concat */
12088 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12089 (ssizeargfunc) unicode_getitem, /* sq_item */
12090 0, /* sq_slice */
12091 0, /* sq_ass_item */
12092 0, /* sq_ass_slice */
12093 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094};
12095
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012096static PyObject*
12097unicode_subscript(PyUnicodeObject* self, PyObject* item)
12098{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099 if (PyUnicode_READY(self) == -1)
12100 return NULL;
12101
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012102 if (PyIndex_Check(item)) {
12103 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012104 if (i == -1 && PyErr_Occurred())
12105 return NULL;
12106 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012107 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012108 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012109 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012110 Py_ssize_t start, stop, step, slicelength, cur, i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 const Py_UNICODE* source_buf;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012112 Py_UNICODE* result_buf;
12113 PyObject* result;
12114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012116 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012117 return NULL;
12118 }
12119
12120 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012121 return PyUnicode_New(0, 0);
12122 } else if (start == 0 && step == 1 &&
12123 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012124 PyUnicode_CheckExact(self)) {
12125 Py_INCREF(self);
12126 return (PyObject *)self;
12127 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012128 return PyUnicode_Substring((PyObject*)self,
12129 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012130 } else {
12131 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +000012132 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
12133 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012134
Benjamin Peterson29060642009-01-31 22:14:21 +000012135 if (result_buf == NULL)
12136 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012137
12138 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12139 result_buf[i] = source_buf[cur];
12140 }
Tim Petersced69f82003-09-16 20:30:58 +000012141
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012142 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +000012143 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012144 return result;
12145 }
12146 } else {
12147 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12148 return NULL;
12149 }
12150}
12151
12152static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012153 (lenfunc)unicode_length, /* mp_length */
12154 (binaryfunc)unicode_subscript, /* mp_subscript */
12155 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012156};
12157
Guido van Rossumd57fd912000-03-10 22:53:23 +000012158
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159/* Helpers for PyUnicode_Format() */
12160
12161static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012162getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012164 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012166 (*p_argidx)++;
12167 if (arglen < 0)
12168 return args;
12169 else
12170 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171 }
12172 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012173 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174 return NULL;
12175}
12176
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012177/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012179static PyObject *
12180formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012182 char *p;
12183 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012185
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186 x = PyFloat_AsDouble(v);
12187 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012188 return NULL;
12189
Guido van Rossumd57fd912000-03-10 22:53:23 +000012190 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012191 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012192
Eric Smith0923d1d2009-04-16 20:16:10 +000012193 p = PyOS_double_to_string(x, type, prec,
12194 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012195 if (p == NULL)
12196 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012198 PyMem_Free(p);
12199 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012200}
12201
Tim Peters38fd5b62000-09-21 05:43:11 +000012202static PyObject*
12203formatlong(PyObject *val, int flags, int prec, int type)
12204{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012205 char *buf;
12206 int len;
12207 PyObject *str; /* temporary string object. */
12208 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012209
Benjamin Peterson14339b62009-01-31 16:36:08 +000012210 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12211 if (!str)
12212 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012214 Py_DECREF(str);
12215 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012216}
12217
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012220 size_t buflen,
12221 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012223 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012224 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012225 if (PyUnicode_GET_LENGTH(v) == 1) {
12226 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012227 buf[1] = '\0';
12228 return 1;
12229 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012230 goto onError;
12231 }
12232 else {
12233 /* Integer input truncated to a character */
12234 long x;
12235 x = PyLong_AsLong(v);
12236 if (x == -1 && PyErr_Occurred())
12237 goto onError;
12238
12239 if (x < 0 || x > 0x10ffff) {
12240 PyErr_SetString(PyExc_OverflowError,
12241 "%c arg not in range(0x110000)");
12242 return -1;
12243 }
12244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012245 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012246 buf[1] = '\0';
12247 return 1;
12248 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012249
Benjamin Peterson29060642009-01-31 22:14:21 +000012250 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012251 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012252 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012253 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012254}
12255
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesised so that it behaves as a single
   operand wherever it is used (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012260
Alexander Belopolsky40018472011-02-26 01:02:56 +000012261PyObject *
12262PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012264 void *fmt;
12265 int fmtkind;
12266 PyObject *result;
12267 Py_UCS4 *res, *res0;
12268 Py_UCS4 max;
12269 int kind;
12270 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012274
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012276 PyErr_BadInternalCall();
12277 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012279 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12280 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012281 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282 fmt = PyUnicode_DATA(uformat);
12283 fmtkind = PyUnicode_KIND(uformat);
12284 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12285 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012286
12287 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012288 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12289 if (res0 == NULL) {
12290 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012291 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012292 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012293
12294 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012295 arglen = PyTuple_Size(args);
12296 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297 }
12298 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012299 arglen = -1;
12300 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012301 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012302 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012303 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012304 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305
12306 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012307 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012308 if (--rescnt < 0) {
12309 rescnt = fmtcnt + 100;
12310 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12312 if (res0 == NULL){
12313 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012314 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315 }
12316 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012317 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012318 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012320 }
12321 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012322 /* Got a format specifier */
12323 int flags = 0;
12324 Py_ssize_t width = -1;
12325 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326 Py_UCS4 c = '\0';
12327 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012328 int isnumok;
12329 PyObject *v = NULL;
12330 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 void *pbuf;
12332 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012333 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334 Py_ssize_t len, len1;
12335 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012336
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012337 fmtpos++;
12338 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12339 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012340 Py_ssize_t keylen;
12341 PyObject *key;
12342 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012343
Benjamin Peterson29060642009-01-31 22:14:21 +000012344 if (dict == NULL) {
12345 PyErr_SetString(PyExc_TypeError,
12346 "format requires a mapping");
12347 goto onError;
12348 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012350 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012352 /* Skip over balanced parentheses */
12353 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012355 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012356 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012357 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012359 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012360 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012361 if (fmtcnt < 0 || pcount > 0) {
12362 PyErr_SetString(PyExc_ValueError,
12363 "incomplete format key");
12364 goto onError;
12365 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012366 key = PyUnicode_Substring((PyObject*)uformat,
12367 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012368 if (key == NULL)
12369 goto onError;
12370 if (args_owned) {
12371 Py_DECREF(args);
12372 args_owned = 0;
12373 }
12374 args = PyObject_GetItem(dict, key);
12375 Py_DECREF(key);
12376 if (args == NULL) {
12377 goto onError;
12378 }
12379 args_owned = 1;
12380 arglen = -1;
12381 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012382 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012383 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012385 case '-': flags |= F_LJUST; continue;
12386 case '+': flags |= F_SIGN; continue;
12387 case ' ': flags |= F_BLANK; continue;
12388 case '#': flags |= F_ALT; continue;
12389 case '0': flags |= F_ZERO; continue;
12390 }
12391 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012392 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012393 if (c == '*') {
12394 v = getnextarg(args, arglen, &argidx);
12395 if (v == NULL)
12396 goto onError;
12397 if (!PyLong_Check(v)) {
12398 PyErr_SetString(PyExc_TypeError,
12399 "* wants int");
12400 goto onError;
12401 }
12402 width = PyLong_AsLong(v);
12403 if (width == -1 && PyErr_Occurred())
12404 goto onError;
12405 if (width < 0) {
12406 flags |= F_LJUST;
12407 width = -width;
12408 }
12409 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012410 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012411 }
12412 else if (c >= '0' && c <= '9') {
12413 width = c - '0';
12414 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012415 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012416 if (c < '0' || c > '9')
12417 break;
12418 if ((width*10) / 10 != width) {
12419 PyErr_SetString(PyExc_ValueError,
12420 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012421 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012422 }
12423 width = width*10 + (c - '0');
12424 }
12425 }
12426 if (c == '.') {
12427 prec = 0;
12428 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012429 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012430 if (c == '*') {
12431 v = getnextarg(args, arglen, &argidx);
12432 if (v == NULL)
12433 goto onError;
12434 if (!PyLong_Check(v)) {
12435 PyErr_SetString(PyExc_TypeError,
12436 "* wants int");
12437 goto onError;
12438 }
12439 prec = PyLong_AsLong(v);
12440 if (prec == -1 && PyErr_Occurred())
12441 goto onError;
12442 if (prec < 0)
12443 prec = 0;
12444 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012445 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012446 }
12447 else if (c >= '0' && c <= '9') {
12448 prec = c - '0';
12449 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012450 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012451 if (c < '0' || c > '9')
12452 break;
12453 if ((prec*10) / 10 != prec) {
12454 PyErr_SetString(PyExc_ValueError,
12455 "prec too big");
12456 goto onError;
12457 }
12458 prec = prec*10 + (c - '0');
12459 }
12460 }
12461 } /* prec */
12462 if (fmtcnt >= 0) {
12463 if (c == 'h' || c == 'l' || c == 'L') {
12464 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012466 }
12467 }
12468 if (fmtcnt < 0) {
12469 PyErr_SetString(PyExc_ValueError,
12470 "incomplete format");
12471 goto onError;
12472 }
12473 if (c != '%') {
12474 v = getnextarg(args, arglen, &argidx);
12475 if (v == NULL)
12476 goto onError;
12477 }
12478 sign = 0;
12479 fill = ' ';
12480 switch (c) {
12481
12482 case '%':
12483 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012484 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012485 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012486 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012487 len = 1;
12488 break;
12489
12490 case 's':
12491 case 'r':
12492 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012493 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012494 temp = v;
12495 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012496 }
12497 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012498 if (c == 's')
12499 temp = PyObject_Str(v);
12500 else if (c == 'r')
12501 temp = PyObject_Repr(v);
12502 else
12503 temp = PyObject_ASCII(v);
12504 if (temp == NULL)
12505 goto onError;
12506 if (PyUnicode_Check(temp))
12507 /* nothing to do */;
12508 else {
12509 Py_DECREF(temp);
12510 PyErr_SetString(PyExc_TypeError,
12511 "%s argument has non-string str()");
12512 goto onError;
12513 }
12514 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515 if (PyUnicode_READY(temp) == -1) {
12516 Py_CLEAR(temp);
12517 goto onError;
12518 }
12519 pbuf = PyUnicode_DATA(temp);
12520 kind = PyUnicode_KIND(temp);
12521 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012522 if (prec >= 0 && len > prec)
12523 len = prec;
12524 break;
12525
12526 case 'i':
12527 case 'd':
12528 case 'u':
12529 case 'o':
12530 case 'x':
12531 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012532 isnumok = 0;
12533 if (PyNumber_Check(v)) {
12534 PyObject *iobj=NULL;
12535
12536 if (PyLong_Check(v)) {
12537 iobj = v;
12538 Py_INCREF(iobj);
12539 }
12540 else {
12541 iobj = PyNumber_Long(v);
12542 }
12543 if (iobj!=NULL) {
12544 if (PyLong_Check(iobj)) {
12545 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012546 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012547 Py_DECREF(iobj);
12548 if (!temp)
12549 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 if (PyUnicode_READY(temp) == -1) {
12551 Py_CLEAR(temp);
12552 goto onError;
12553 }
12554 pbuf = PyUnicode_DATA(temp);
12555 kind = PyUnicode_KIND(temp);
12556 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012557 sign = 1;
12558 }
12559 else {
12560 Py_DECREF(iobj);
12561 }
12562 }
12563 }
12564 if (!isnumok) {
12565 PyErr_Format(PyExc_TypeError,
12566 "%%%c format: a number is required, "
12567 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12568 goto onError;
12569 }
12570 if (flags & F_ZERO)
12571 fill = '0';
12572 break;
12573
12574 case 'e':
12575 case 'E':
12576 case 'f':
12577 case 'F':
12578 case 'g':
12579 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012580 temp = formatfloat(v, flags, prec, c);
12581 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012582 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012583 if (PyUnicode_READY(temp) == -1) {
12584 Py_CLEAR(temp);
12585 goto onError;
12586 }
12587 pbuf = PyUnicode_DATA(temp);
12588 kind = PyUnicode_KIND(temp);
12589 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012590 sign = 1;
12591 if (flags & F_ZERO)
12592 fill = '0';
12593 break;
12594
12595 case 'c':
12596 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012597 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012598 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012599 if (len < 0)
12600 goto onError;
12601 break;
12602
12603 default:
12604 PyErr_Format(PyExc_ValueError,
12605 "unsupported format character '%c' (0x%x) "
12606 "at index %zd",
12607 (31<=c && c<=126) ? (char)c : '?',
12608 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012609 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012610 goto onError;
12611 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612 /* pbuf is initialized here. */
12613 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000012614 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012615 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12616 PyUnicode_READ(kind, pbuf, pindex) == '+') {
12617 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012618 len--;
12619 }
12620 else if (flags & F_SIGN)
12621 sign = '+';
12622 else if (flags & F_BLANK)
12623 sign = ' ';
12624 else
12625 sign = 0;
12626 }
12627 if (width < len)
12628 width = len;
12629 if (rescnt - (sign != 0) < width) {
12630 reslen -= rescnt;
12631 rescnt = width + fmtcnt + 100;
12632 reslen += rescnt;
12633 if (reslen < 0) {
12634 Py_XDECREF(temp);
12635 PyErr_NoMemory();
12636 goto onError;
12637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012638 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12639 if (res0 == 0) {
12640 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012641 Py_XDECREF(temp);
12642 goto onError;
12643 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012644 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012645 }
12646 if (sign) {
12647 if (fill != ' ')
12648 *res++ = sign;
12649 rescnt--;
12650 if (width > len)
12651 width--;
12652 }
12653 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012654 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12655 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000012656 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12658 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012659 }
12660 rescnt -= 2;
12661 width -= 2;
12662 if (width < 0)
12663 width = 0;
12664 len -= 2;
12665 }
12666 if (width > len && !(flags & F_LJUST)) {
12667 do {
12668 --rescnt;
12669 *res++ = fill;
12670 } while (--width > len);
12671 }
12672 if (fill == ' ') {
12673 if (sign)
12674 *res++ = sign;
12675 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12677 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12678 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12679 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012680 }
12681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012682 /* Copy all characters, preserving len */
12683 len1 = len;
12684 while (len1--) {
12685 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12686 rescnt--;
12687 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012688 while (--width >= len) {
12689 --rescnt;
12690 *res++ = ' ';
12691 }
12692 if (dict && (argidx < arglen) && c != '%') {
12693 PyErr_SetString(PyExc_TypeError,
12694 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000012695 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012696 goto onError;
12697 }
12698 Py_XDECREF(temp);
12699 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012700 } /* until end */
12701 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012702 PyErr_SetString(PyExc_TypeError,
12703 "not all arguments converted during string formatting");
12704 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012705 }
12706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012707
12708 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12709 if (*res > max)
12710 max = *res;
12711 result = PyUnicode_New(reslen - rescnt, max);
12712 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000012713 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012714 kind = PyUnicode_KIND(result);
12715 for (res = res0; res < res0+reslen-rescnt; res++)
12716 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12717 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012718 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012719 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720 }
12721 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012722 return (PyObject *)result;
12723
Benjamin Peterson29060642009-01-31 22:14:21 +000012724 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012726 Py_DECREF(uformat);
12727 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012728 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012729 }
12730 return NULL;
12731}
12732
Jeremy Hylton938ace62002-07-17 16:30:39 +000012733static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000012734unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12735
Tim Peters6d6c1a32001-08-02 04:15:00 +000012736static PyObject *
12737unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12738{
Benjamin Peterson29060642009-01-31 22:14:21 +000012739 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012740 static char *kwlist[] = {"object", "encoding", "errors", 0};
12741 char *encoding = NULL;
12742 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000012743
Benjamin Peterson14339b62009-01-31 16:36:08 +000012744 if (type != &PyUnicode_Type)
12745 return unicode_subtype_new(type, args, kwds);
12746 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000012747 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012748 return NULL;
12749 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012750 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012751 if (encoding == NULL && errors == NULL)
12752 return PyObject_Str(x);
12753 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012754 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000012755}
12756
Guido van Rossume023fe02001-08-30 03:12:59 +000012757static PyObject *
12758unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12759{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012760 PyUnicodeObject *unicode, *self;
12761 Py_ssize_t length, char_size;
12762 int share_wstr, share_utf8;
12763 unsigned int kind;
12764 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000012765
Benjamin Peterson14339b62009-01-31 16:36:08 +000012766 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012767
12768 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12769 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012770 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020012771 assert(_PyUnicode_CHECK(unicode));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012772 if (PyUnicode_READY(unicode))
12773 return NULL;
12774
12775 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12776 if (self == NULL) {
12777 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012778 return NULL;
12779 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012780 kind = PyUnicode_KIND(unicode);
12781 length = PyUnicode_GET_LENGTH(unicode);
12782
12783 _PyUnicode_LENGTH(self) = length;
12784 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12785 _PyUnicode_STATE(self).interned = 0;
12786 _PyUnicode_STATE(self).kind = kind;
12787 _PyUnicode_STATE(self).compact = 0;
12788 _PyUnicode_STATE(self).ascii = 0;
12789 _PyUnicode_STATE(self).ready = 1;
12790 _PyUnicode_WSTR(self) = NULL;
12791 _PyUnicode_UTF8_LENGTH(self) = 0;
12792 _PyUnicode_UTF8(self) = NULL;
12793 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020012794 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012795
12796 share_utf8 = 0;
12797 share_wstr = 0;
12798 if (kind == PyUnicode_1BYTE_KIND) {
12799 char_size = 1;
12800 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12801 share_utf8 = 1;
12802 }
12803 else if (kind == PyUnicode_2BYTE_KIND) {
12804 char_size = 2;
12805 if (sizeof(wchar_t) == 2)
12806 share_wstr = 1;
12807 }
12808 else {
12809 assert(kind == PyUnicode_4BYTE_KIND);
12810 char_size = 4;
12811 if (sizeof(wchar_t) == 4)
12812 share_wstr = 1;
12813 }
12814
12815 /* Ensure we won't overflow the length. */
12816 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12817 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012818 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012819 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012820 data = PyObject_MALLOC((length + 1) * char_size);
12821 if (data == NULL) {
12822 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012823 goto onError;
12824 }
12825
Victor Stinnerc3c74152011-10-02 20:39:55 +020012826 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012827 if (share_utf8) {
12828 _PyUnicode_UTF8_LENGTH(self) = length;
12829 _PyUnicode_UTF8(self) = data;
12830 }
12831 if (share_wstr) {
12832 _PyUnicode_WSTR_LENGTH(self) = length;
12833 _PyUnicode_WSTR(self) = (wchar_t *)data;
12834 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012835
Victor Stinner07ac3eb2011-10-01 16:16:43 +020012836 Py_MEMCPY(data, PyUnicode_DATA(unicode),
12837 PyUnicode_KIND_SIZE(kind, length + 1));
12838 Py_DECREF(unicode);
12839 return (PyObject *)self;
12840
12841onError:
12842 Py_DECREF(unicode);
12843 Py_DECREF(self);
12844 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000012845}
12846
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012847PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000012848 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000012849\n\
Collin Winterd474ce82007-08-07 19:42:11 +000012850Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000012851encoding defaults to the current default string encoding.\n\
12852errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000012853
Guido van Rossum50e9fb92006-08-17 05:42:55 +000012854static PyObject *unicode_iter(PyObject *seq);
12855
Guido van Rossumd57fd912000-03-10 22:53:23 +000012856PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000012857 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012858 "str", /* tp_name */
12859 sizeof(PyUnicodeObject), /* tp_size */
12860 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012861 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012862 (destructor)unicode_dealloc, /* tp_dealloc */
12863 0, /* tp_print */
12864 0, /* tp_getattr */
12865 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000012866 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012867 unicode_repr, /* tp_repr */
12868 &unicode_as_number, /* tp_as_number */
12869 &unicode_as_sequence, /* tp_as_sequence */
12870 &unicode_as_mapping, /* tp_as_mapping */
12871 (hashfunc) unicode_hash, /* tp_hash*/
12872 0, /* tp_call*/
12873 (reprfunc) unicode_str, /* tp_str */
12874 PyObject_GenericGetAttr, /* tp_getattro */
12875 0, /* tp_setattro */
12876 0, /* tp_as_buffer */
12877 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000012878 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000012879 unicode_doc, /* tp_doc */
12880 0, /* tp_traverse */
12881 0, /* tp_clear */
12882 PyUnicode_RichCompare, /* tp_richcompare */
12883 0, /* tp_weaklistoffset */
12884 unicode_iter, /* tp_iter */
12885 0, /* tp_iternext */
12886 unicode_methods, /* tp_methods */
12887 0, /* tp_members */
12888 0, /* tp_getset */
12889 &PyBaseObject_Type, /* tp_base */
12890 0, /* tp_dict */
12891 0, /* tp_descr_get */
12892 0, /* tp_descr_set */
12893 0, /* tp_dictoffset */
12894 0, /* tp_init */
12895 0, /* tp_alloc */
12896 unicode_new, /* tp_new */
12897 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012898};
12899
12900/* Initialize the Unicode implementation */
12901
Thomas Wouters78890102000-07-22 19:25:51 +000012902void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012903{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012904 int i;
12905
Thomas Wouters477c8d52006-05-27 19:21:47 +000012906 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012907 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012908 0x000A, /* LINE FEED */
12909 0x000D, /* CARRIAGE RETURN */
12910 0x001C, /* FILE SEPARATOR */
12911 0x001D, /* GROUP SEPARATOR */
12912 0x001E, /* RECORD SEPARATOR */
12913 0x0085, /* NEXT LINE */
12914 0x2028, /* LINE SEPARATOR */
12915 0x2029, /* PARAGRAPH SEPARATOR */
12916 };
12917
Fred Drakee4315f52000-05-09 19:53:39 +000012918 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020012919 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012920 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012921 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012922
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012923 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000012924 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000012925 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012926 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012927
12928 /* initialize the linebreak bloom filter */
12929 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012930 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020012931 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012932
12933 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012934}
12935
12936/* Finalize the Unicode implementation */
12937
/* Performs no work and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
12943
Guido van Rossumd57fd912000-03-10 22:53:23 +000012944void
Thomas Wouters78890102000-07-22 19:25:51 +000012945_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012946{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012947 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012948
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000012949 Py_XDECREF(unicode_empty);
12950 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000012951
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012952 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012953 if (unicode_latin1[i]) {
12954 Py_DECREF(unicode_latin1[i]);
12955 unicode_latin1[i] = NULL;
12956 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000012957 }
Christian Heimesa156e092008-02-16 07:38:31 +000012958 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012959}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000012960
Walter Dörwald16807132007-05-25 13:52:07 +000012961void
12962PyUnicode_InternInPlace(PyObject **p)
12963{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012964 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
12965 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020012966#ifdef Py_DEBUG
12967 assert(s != NULL);
12968 assert(_PyUnicode_CHECK(s));
12969#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000012970 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020012971 return;
12972#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000012973 /* If it's a subclass, we don't really know what putting
12974 it in the interned dict might do. */
12975 if (!PyUnicode_CheckExact(s))
12976 return;
12977 if (PyUnicode_CHECK_INTERNED(s))
12978 return;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012979 if (PyUnicode_READY(s) == -1) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020012980 assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012981 return;
12982 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012983 if (interned == NULL) {
12984 interned = PyDict_New();
12985 if (interned == NULL) {
12986 PyErr_Clear(); /* Don't leave an exception */
12987 return;
12988 }
12989 }
12990 /* It might be that the GetItem call fails even
12991 though the key is present in the dictionary,
12992 namely when this happens during a stack overflow. */
12993 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000012994 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012995 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000012996
Benjamin Peterson29060642009-01-31 22:14:21 +000012997 if (t) {
12998 Py_INCREF(t);
12999 Py_DECREF(*p);
13000 *p = t;
13001 return;
13002 }
Walter Dörwald16807132007-05-25 13:52:07 +000013003
Benjamin Peterson14339b62009-01-31 16:36:08 +000013004 PyThreadState_GET()->recursion_critical = 1;
13005 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13006 PyErr_Clear();
13007 PyThreadState_GET()->recursion_critical = 0;
13008 return;
13009 }
13010 PyThreadState_GET()->recursion_critical = 0;
13011 /* The two references in interned are not counted by refcnt.
13012 The deallocator will take care of this */
13013 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013014 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013015}
13016
13017void
13018PyUnicode_InternImmortal(PyObject **p)
13019{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013020 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13021
Benjamin Peterson14339b62009-01-31 16:36:08 +000013022 PyUnicode_InternInPlace(p);
13023 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013024 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013025 Py_INCREF(*p);
13026 }
Walter Dörwald16807132007-05-25 13:52:07 +000013027}
13028
13029PyObject *
13030PyUnicode_InternFromString(const char *cp)
13031{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013032 PyObject *s = PyUnicode_FromString(cp);
13033 if (s == NULL)
13034 return NULL;
13035 PyUnicode_InternInPlace(&s);
13036 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013037}
13038
Alexander Belopolsky40018472011-02-26 01:02:56 +000013039void
13040_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013041{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013042 PyObject *keys;
13043 PyUnicodeObject *s;
13044 Py_ssize_t i, n;
13045 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013046
Benjamin Peterson14339b62009-01-31 16:36:08 +000013047 if (interned == NULL || !PyDict_Check(interned))
13048 return;
13049 keys = PyDict_Keys(interned);
13050 if (keys == NULL || !PyList_Check(keys)) {
13051 PyErr_Clear();
13052 return;
13053 }
Walter Dörwald16807132007-05-25 13:52:07 +000013054
Benjamin Peterson14339b62009-01-31 16:36:08 +000013055 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13056 detector, interned unicode strings are not forcibly deallocated;
13057 rather, we give them their stolen references back, and then clear
13058 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013059
Benjamin Peterson14339b62009-01-31 16:36:08 +000013060 n = PyList_GET_SIZE(keys);
13061 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013062 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013063 for (i = 0; i < n; i++) {
13064 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013065 if (PyUnicode_READY(s) == -1)
13066 fprintf(stderr, "could not ready string\n");
13067 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013068 case SSTATE_NOT_INTERNED:
13069 /* XXX Shouldn't happen */
13070 break;
13071 case SSTATE_INTERNED_IMMORTAL:
13072 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013073 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013074 break;
13075 case SSTATE_INTERNED_MORTAL:
13076 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013077 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013078 break;
13079 default:
13080 Py_FatalError("Inconsistent interned string state.");
13081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013082 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013083 }
13084 fprintf(stderr, "total size of all interned strings: "
13085 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13086 "mortal/immortal\n", mortal_size, immortal_size);
13087 Py_DECREF(keys);
13088 PyDict_Clear(interned);
13089 Py_DECREF(interned);
13090 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013091}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013092
13093
13094/********************* Unicode Iterator **************************/
13095
/* State of a str iterator: the string being iterated and the position of
   the next character to return. */
13096typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013097 PyObject_HEAD
/* Index of the next character unicodeiter_next() will read from it_seq. */
13098 Py_ssize_t it_index;
/* Owned reference taken in unicode_iter() and released by
   unicodeiter_next() on exhaustion or by unicodeiter_dealloc(). */
13099 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013100} unicodeiterobject;
13101
13102static void
13103unicodeiter_dealloc(unicodeiterobject *it)
13104{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013105 _PyObject_GC_UNTRACK(it);
13106 Py_XDECREF(it->it_seq);
13107 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013108}
13109
13110static int
13111unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13112{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013113 Py_VISIT(it->it_seq);
13114 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013115}
13116
13117static PyObject *
13118unicodeiter_next(unicodeiterobject *it)
13119{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013120 PyUnicodeObject *seq;
13121 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013122
Benjamin Peterson14339b62009-01-31 16:36:08 +000013123 assert(it != NULL);
13124 seq = it->it_seq;
13125 if (seq == NULL)
13126 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013127 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013129 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13130 int kind = PyUnicode_KIND(seq);
13131 void *data = PyUnicode_DATA(seq);
13132 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13133 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013134 if (item != NULL)
13135 ++it->it_index;
13136 return item;
13137 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013138
Benjamin Peterson14339b62009-01-31 16:36:08 +000013139 Py_DECREF(seq);
13140 it->it_seq = NULL;
13141 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013142}
13143
13144static PyObject *
13145unicodeiter_len(unicodeiterobject *it)
13146{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013147 Py_ssize_t len = 0;
13148 if (it->it_seq)
13149 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13150 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013151}
13152
13153PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13154
/* Method table of the str iterator type; it exposes only
   __length_hint__, implemented by unicodeiter_len(). */
13155static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013156 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013157 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013158 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013159};
13160
/* Type object for str iterators ("str_iterator").  Instances are
   GC-tracked unicodeiterobject structs; the type is its own iterator
   (PyObject_SelfIter) and yields items through unicodeiter_next(). */
13161PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013162 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13163 "str_iterator", /* tp_name */
13164 sizeof(unicodeiterobject), /* tp_basicsize */
13165 0, /* tp_itemsize */
13166 /* methods */
13167 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13168 0, /* tp_print */
13169 0, /* tp_getattr */
13170 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013171 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013172 0, /* tp_repr */
13173 0, /* tp_as_number */
13174 0, /* tp_as_sequence */
13175 0, /* tp_as_mapping */
13176 0, /* tp_hash */
13177 0, /* tp_call */
13178 0, /* tp_str */
13179 PyObject_GenericGetAttr, /* tp_getattro */
13180 0, /* tp_setattro */
13181 0, /* tp_as_buffer */
13182 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13183 0, /* tp_doc */
13184 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13185 0, /* tp_clear */
13186 0, /* tp_richcompare */
13187 0, /* tp_weaklistoffset */
13188 PyObject_SelfIter, /* tp_iter */
13189 (iternextfunc)unicodeiter_next, /* tp_iternext */
13190 unicodeiter_methods, /* tp_methods */
13191 0, /* tp_members */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013192};
13193
13194static PyObject *
13195unicode_iter(PyObject *seq)
13196{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013197 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013198
Benjamin Peterson14339b62009-01-31 16:36:08 +000013199 if (!PyUnicode_Check(seq)) {
13200 PyErr_BadInternalCall();
13201 return NULL;
13202 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013203 if (PyUnicode_READY(seq) == -1)
13204 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013205 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13206 if (it == NULL)
13207 return NULL;
13208 it->it_index = 0;
13209 Py_INCREF(seq);
13210 it->it_seq = (PyUnicodeObject *)seq;
13211 _PyObject_GC_TRACK(it);
13212 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013213}
13214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013215#define UNIOP(x) Py_UNICODE_##x
13216#define UNIOP_t Py_UNICODE
13217#include "uniops.h"
13218#undef UNIOP
13219#undef UNIOP_t
13220#define UNIOP(x) Py_UCS4_##x
13221#define UNIOP_t Py_UCS4
13222#include "uniops.h"
13223#undef UNIOP
13224#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013225
Victor Stinner71133ff2010-09-01 23:43:53 +000013226Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013227PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013228{
13229 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13230 Py_UNICODE *copy;
13231 Py_ssize_t size;
13232
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013233 if (!PyUnicode_Check(unicode)) {
13234 PyErr_BadArgument();
13235 return NULL;
13236 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013237 /* Ensure we won't overflow the size. */
13238 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13239 PyErr_NoMemory();
13240 return NULL;
13241 }
13242 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13243 size *= sizeof(Py_UNICODE);
13244 copy = PyMem_Malloc(size);
13245 if (copy == NULL) {
13246 PyErr_NoMemory();
13247 return NULL;
13248 }
13249 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13250 return copy;
13251}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013252
Georg Brandl66c221e2010-10-14 07:04:07 +000013253/* A _string module, to export formatter_parser and formatter_field_name_split
13254 to the string.Formatter class implemented in Python. */
13255
/* Functions exported by the _string module.  Both entries are METH_O,
   i.e. each takes exactly one argument. */
13256static PyMethodDef _string_methods[] = {
13257 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13258 METH_O, PyDoc_STR("split the argument as a field name")},
13259 {"formatter_parser", (PyCFunction) formatter_parser,
13260 METH_O, PyDoc_STR("parse the argument as a format string")},
13261 {NULL, NULL} /* sentinel */
13262};
13263
/* Module definition for _string; passed to PyModule_Create() by
   PyInit__string(). */
13264static struct PyModuleDef _string_module = {
13265 PyModuleDef_HEAD_INIT,
13266 "_string", /* m_name */
13267 PyDoc_STR("string helper module"), /* m_doc */
13268 0, /* m_size */
13269 _string_methods, /* m_methods */
13270 NULL, /* m_reload */
13271 NULL, /* m_traverse */
13272 NULL, /* m_clear */
13273 NULL /* m_free */
13274};
13275
13276PyMODINIT_FUNC
13277PyInit__string(void)
13278{
13279 return PyModule_Create(&_string_module);
13280}
13281
13282
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013283#ifdef __cplusplus
13284}
13285#endif